SOLR-2585: Context-Sensitive Spelling Suggestions & Collations (spellcheck.alternativeTermCount & spellcheck.maxResultsForSuggest)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1341894 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
James Dyer 2012-05-23 15:27:29 +00:00
parent 88b483cbbd
commit ec1649300c
15 changed files with 360 additions and 123 deletions

View File

@ -302,6 +302,13 @@ New Features
choose the correct ContentStreamLoader based on Content-Type header. This
also deprecates the existing [Xml,JSON,CSV,Binary,Xslt]UpdateRequestHandler.
(ryan)
* SOLR-2585: Context-Sensitive Spelling Suggestions & Collations. This adds support
for the "spellcheck.alternativeTermCount" & "spellcheck.maxResultsForSuggest"
parameters, letting users receive suggestions even when all the queried terms
exist in the dictionary. This differs from "spellcheck.onlyMorePopular" in
that the suggestions need not consist entirely of terms with a greater document
frequency than the queried terms. (James Dyer)
Optimizations

View File

@ -399,6 +399,13 @@ public class ResponseBuilder
rsp.getResponseHeader().add("partialResults", Boolean.TRUE);
}
}
/**
 * Reports the number of documents found for this request.
 *
 * @return the value of {@code _responseDocs.getNumFound()}, or 0 when
 *         {@code _responseDocs} has not been set
 */
public long getNumberDocumentsFound() {
  return (_responseDocs != null) ? _responseDocs.getNumFound() : 0;
}
public ScoreDoc getScoreDoc()
{

View File

@ -22,6 +22,7 @@ import java.io.StringReader;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.solr.client.solrj.response.SpellCheckResponse;
import org.apache.solr.common.params.ModifiableSolrParams;
@ -136,26 +137,47 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
if (tokens != null && tokens.isEmpty() == false) {
if (spellChecker != null) {
int count = params.getInt(SPELLCHECK_COUNT, 1);
boolean onlyMorePopular = params.getBool(SPELLCHECK_ONLY_MORE_POPULAR,
DEFAULT_ONLY_MORE_POPULAR);
boolean extendedResults = params.getBool(SPELLCHECK_EXTENDED_RESULTS,
false);
NamedList response = new SimpleOrderedMap();
IndexReader reader = rb.req.getSearcher().getIndexReader();
boolean onlyMorePopular = params.getBool(SPELLCHECK_ONLY_MORE_POPULAR, DEFAULT_ONLY_MORE_POPULAR);
boolean extendedResults = params.getBool(SPELLCHECK_EXTENDED_RESULTS, false);
boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE);
Integer alternativeTermCount = params.getInt(SpellingParams.SPELLCHECK_ALTERNATIVE_TERM_COUNT);
Integer maxResultsForSuggest = params.getInt(SpellingParams.SPELLCHECK_MAX_RESULTS_FOR_SUGGEST);
SolrParams customParams = getCustomParams(getDictionaryName(params), params);
SpellingOptions options = new SpellingOptions(tokens, reader, count, onlyMorePopular, extendedResults,
accuracy, customParams);
SpellingResult spellingResult = spellChecker.getSuggestions(options);
if (spellingResult != null) {
NamedList suggestions = toNamedList(shardRequest, spellingResult, q, extendedResults, collate);
if (collate) {
addCollationsToResponse(params, spellingResult, rb, q, suggestions);
}
response.add("suggestions", suggestions);
rb.rsp.add("spellcheck", response);
Integer hitsInteger = (Integer) rb.rsp.getToLog().get("hits");
long hits = 0;
if (hitsInteger == null) {
hits = rb.getNumberDocumentsFound();
} else {
hits = hitsInteger.longValue();
}
SpellingResult spellingResult = null;
if (maxResultsForSuggest == null || hits <= maxResultsForSuggest) {
SuggestMode suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
if (onlyMorePopular) {
suggestMode = SuggestMode.SUGGEST_MORE_POPULAR;
} else if (alternativeTermCount != null) {
suggestMode = SuggestMode.SUGGEST_ALWAYS;
}
IndexReader reader = rb.req.getSearcher().getIndexReader();
SpellingOptions options = new SpellingOptions(tokens, reader, count,
alternativeTermCount, suggestMode, extendedResults, accuracy,
customParams);
spellingResult = spellChecker.getSuggestions(options);
} else {
spellingResult = new SpellingResult();
}
boolean isCorrectlySpelled = hits > (maxResultsForSuggest==null ? 0 : maxResultsForSuggest);
NamedList suggestions = toNamedList(shardRequest, spellingResult, q,
extendedResults, collate, isCorrectlySpelled);
if (collate) {
addCollationsToResponse(params, spellingResult, rb, q, suggestions);
}
NamedList response = new SimpleOrderedMap();
response.add("suggestions", suggestions);
rb.rsp.add("spellcheck", response);
} else {
throw new SolrException(SolrException.ErrorCode.NOT_FOUND,
@ -249,6 +271,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
boolean collationExtendedResults = params.getBool(SPELLCHECK_COLLATE_EXTENDED_RESULTS, false);
int maxCollationTries = params.getInt(SPELLCHECK_MAX_COLLATION_TRIES, 0);
int maxCollations = params.getInt(SPELLCHECK_MAX_COLLATIONS, 1);
Integer maxResultsForSuggest = params.getInt(SpellingParams.SPELLCHECK_MAX_RESULTS_FOR_SUGGEST);
int count = rb.req.getParams().getInt(SPELLCHECK_COUNT, 1);
int numSug = Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);
@ -258,17 +281,22 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
if (origQuery == null) {
origQuery = params.get(CommonParams.Q);
}
}
}
SpellCheckMergeData mergeData = new SpellCheckMergeData();
for (ShardRequest sreq : rb.finished) {
for (ShardResponse srsp : sreq.responses) {
NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck");
LOG.info(srsp.getShard() + " " + nl);
if (nl != null) {
mergeData.totalNumberShardResponses++;
collectShardSuggestions(nl, mergeData);
collectShardCollations(mergeData, nl, maxCollationTries);
long hits = rb.getNumberDocumentsFound();
boolean isCorrectlySpelled = hits > (maxResultsForSuggest==null ? 0 : maxResultsForSuggest);
SpellCheckMergeData mergeData = new SpellCheckMergeData();
if (maxResultsForSuggest==null || !isCorrectlySpelled) {
for (ShardRequest sreq : rb.finished) {
for (ShardResponse srsp : sreq.responses) {
NamedList nl = (NamedList) srsp.getSolrResponse().getResponse().get("spellcheck");
LOG.info(srsp.getShard() + " " + nl);
if (nl != null) {
mergeData.totalNumberShardResponses++;
collectShardSuggestions(nl, mergeData);
collectShardCollations(mergeData, nl, maxCollationTries);
}
}
}
}
@ -279,10 +307,12 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
SpellingResult result = checker.mergeSuggestions(mergeData, numSug, count, extendedResults);
NamedList response = new SimpleOrderedMap();
NamedList suggestions = toNamedList(false, result, origQuery, extendedResults, collate);
if (collate) {
SpellCheckCollation[] sortedCollations = mergeData.collations.values().toArray(new SpellCheckCollation[mergeData.collations.size()]);
Arrays.sort(sortedCollations);
NamedList suggestions = toNamedList(false, result, origQuery,
extendedResults, collate, isCorrectlySpelled);
if (collate) {
SpellCheckCollation[] sortedCollations = mergeData.collations.values()
.toArray(new SpellCheckCollation[mergeData.collations.size()]);
Arrays.sort(sortedCollations);
int i = 0;
while (i < maxCollations && i < sortedCollations.length) {
SpellCheckCollation collation = sortedCollations[i];
@ -341,7 +371,8 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
}
sug.string = alternative;
// alternative frequency is present only for extendedResults=true
if (suggestion.getAlternativeFrequencies() != null && suggestion.getAlternativeFrequencies().size() > 0) {
if (suggestion.getAlternativeFrequencies() != null
&& suggestion.getAlternativeFrequencies().size() > 0) {
Integer freq = suggestion.getAlternativeFrequencies().get(i);
if (freq != null) sug.freq += freq;
}
@ -446,68 +477,74 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
return spellCheckers.get(name);
}
protected NamedList toNamedList(boolean shardRequest, SpellingResult spellingResult, String origQuery, boolean extendedResults, boolean collate) {
protected NamedList toNamedList(boolean shardRequest,
SpellingResult spellingResult, String origQuery, boolean extendedResults,
boolean collate, boolean correctlySpelled) {
NamedList result = new NamedList();
Map<Token, LinkedHashMap<String, Integer>> suggestions = spellingResult.getSuggestions();
Map<Token,LinkedHashMap<String,Integer>> suggestions = spellingResult
.getSuggestions();
boolean hasFreqInfo = spellingResult.hasTokenFrequencyInfo();
boolean isCorrectlySpelled = false;
int numSuggestions = 0;
for(LinkedHashMap<String, Integer> theSuggestion : suggestions.values())
{
if(theSuggestion.size()>0)
{
numSuggestions++;
}
}
// will be flipped to false if any of the suggestions are not in the index and hasFreqInfo is true
if(numSuggestions > 0) {
isCorrectlySpelled = true;
}
for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) {
boolean hasSuggestions = false;
boolean hasZeroFrequencyToken = false;
for (Map.Entry<Token,LinkedHashMap<String,Integer>> entry : suggestions
.entrySet()) {
Token inputToken = entry.getKey();
Map<String, Integer> theSuggestions = entry.getValue();
if (theSuggestions != null && (theSuggestions.size()>0 || shardRequest)) {
String tokenString = new String(inputToken.buffer(), 0, inputToken
.length());
Map<String,Integer> theSuggestions = new LinkedHashMap<String,Integer>(
entry.getValue());
Iterator<String> sugIter = theSuggestions.keySet().iterator();
while (sugIter.hasNext()) {
String sug = sugIter.next();
if (sug.equals(tokenString)) {
sugIter.remove();
}
}
if (theSuggestions.size() > 0) {
hasSuggestions = true;
}
if (theSuggestions != null && (theSuggestions.size() > 0 || shardRequest)) {
SimpleOrderedMap suggestionList = new SimpleOrderedMap();
suggestionList.add("numFound", theSuggestions.size());
suggestionList.add("startOffset", inputToken.startOffset());
suggestionList.add("endOffset", inputToken.endOffset());
// Logical structure of normal (non-extended) results:
// "suggestion":["alt1","alt2"]
//
// Logical structure of the extended results:
// "suggestion":[
// {"word":"alt1","freq":7},
// {"word":"alt2","freq":4}
// {"word":"alt1","freq":7},
// {"word":"alt2","freq":4}
// ]
if (extendedResults && hasFreqInfo) {
suggestionList.add("origFreq", spellingResult.getTokenFrequency(inputToken));
suggestionList.add("origFreq", spellingResult
.getTokenFrequency(inputToken));
ArrayList<SimpleOrderedMap> sugs = new ArrayList<SimpleOrderedMap>();
suggestionList.add("suggestion", sugs);
for (Map.Entry<String, Integer> suggEntry : theSuggestions.entrySet()) {
for (Map.Entry<String,Integer> suggEntry : theSuggestions.entrySet()) {
SimpleOrderedMap sugEntry = new SimpleOrderedMap();
sugEntry.add("word",suggEntry.getKey());
sugEntry.add("freq",suggEntry.getValue());
sugEntry.add("word", suggEntry.getKey());
sugEntry.add("freq", suggEntry.getValue());
sugs.add(sugEntry);
}
} else {
suggestionList.add("suggestion", theSuggestions.keySet());
}
if (hasFreqInfo) {
isCorrectlySpelled = isCorrectlySpelled && spellingResult.getTokenFrequency(inputToken) > 0;
int tokenFrequency = spellingResult.getTokenFrequency(inputToken);
if (tokenFrequency == 0) {
hasZeroFrequencyToken = true;
}
}
result.add(new String(inputToken.buffer(), 0, inputToken.length()), suggestionList);
result.add(tokenString, suggestionList);
}
}
if (hasFreqInfo) {
result.add("correctlySpelled", isCorrectlySpelled);
} else if(extendedResults && suggestions.size() == 0) { // if the word is misspelled, its added to suggestions with freqinfo
result.add("correctlySpelled", true);
if (extendedResults) {
result.add("correctlySpelled", correctlySpelled);
}
return result;
}

View File

@ -44,6 +44,7 @@ import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.SpellingParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
@ -147,22 +148,42 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
float theAccuracy = (options.accuracy == Float.MIN_VALUE) ? spellChecker.getAccuracy() : options.accuracy;
int count = Math.max(options.count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);
SuggestMode mode = options.onlyMorePopular ? SuggestMode.SUGGEST_MORE_POPULAR : SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
for (Token token : options.tokens) {
String tokenText = new String(token.buffer(), 0, token.length());
term = new Term(field, tokenText);
int docFreq = 0;
if (reader != null) {
docFreq = reader.docFreq(term);
}
String[] suggestions = spellChecker.suggestSimilar(tokenText,
count,
field != null ? reader : null, //workaround LUCENE-1295
field,
mode, theAccuracy);
if (suggestions.length == 1 && suggestions[0].equals(tokenText)) {
//These are spelled the same, continue on
((options.alternativeTermCount == null || docFreq == 0) ? count
: options.alternativeTermCount), field != null ? reader : null, // workaround LUCENE-1295
field, options.suggestMode, theAccuracy);
if (suggestions.length == 1 && suggestions[0].equals(tokenText)
&& options.alternativeTermCount == null) {
// These are spelled the same, continue on
continue;
}
// If considering alternatives to "correctly-spelled" terms, then add the
// original as a viable suggestion.
if (options.alternativeTermCount != null && docFreq > 0) {
boolean foundOriginal = false;
String[] suggestionsWithOrig = new String[suggestions.length + 1];
for (int i = 0; i < suggestions.length; i++) {
if (suggestions[i].equals(tokenText)) {
foundOriginal = true;
break;
}
suggestionsWithOrig[i + 1] = suggestions[i];
}
if (!foundOriginal) {
suggestionsWithOrig[0] = tokenText;
suggestions = suggestionsWithOrig;
}
}
if (options.extendedResults == true && reader != null && field != null) {
term = new Term(field, tokenText);
result.addFrequency(token, reader.docFreq(term));
result.addFrequency(token, docFreq);
int countLimit = Math.min(options.count, suggestions.length);
if(countLimit>0)
{

View File

@ -181,21 +181,36 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
SpellingResult result = new SpellingResult();
float accuracy = (options.accuracy == Float.MIN_VALUE) ? checker.getAccuracy() : options.accuracy;
SuggestMode mode = options.onlyMorePopular ? SuggestMode.SUGGEST_MORE_POPULAR : SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
for (Token token : options.tokens) {
Term term = new Term(field, token.toString());
SuggestWord[] suggestions = checker.suggestSimilar(term,
options.count, options.reader, mode, accuracy);
int docFreq = 0;
if(options.extendedResults || suggestions.length==0) {
docFreq = options.reader.docFreq(term);
}
if(options.extendedResults) {
result.addFrequency(token, docFreq);
}
if(suggestions.length==0 && docFreq==0) {
String tokenText = token.toString();
Term term = new Term(field, tokenText);
int freq = options.reader.docFreq(term);
int count = (options.alternativeTermCount != null && freq > 0) ? options.alternativeTermCount: options.count;
SuggestWord[] suggestions = checker.suggestSimilar(term, count,options.reader, options.suggestMode, accuracy);
result.addFrequency(token, freq);
// If considering alternatives to "correctly-spelled" terms, then add the
// original as a viable suggestion.
if (options.alternativeTermCount != null && freq > 0) {
boolean foundOriginal = false;
SuggestWord[] suggestionsWithOrig = new SuggestWord[suggestions.length + 1];
for (int i = 0; i < suggestions.length; i++) {
if (suggestions[i].string.equals(tokenText)) {
foundOriginal = true;
break;
}
suggestionsWithOrig[i + 1] = suggestions[i];
}
if (!foundOriginal) {
SuggestWord orig = new SuggestWord();
orig.freq = freq;
orig.string = tokenText;
suggestionsWithOrig[0] = orig;
suggestions = suggestionsWithOrig;
}
}
if(suggestions.length==0 && freq==0) {
List<String> empty = Collections.emptyList();
result.add(token, empty);
} else {

View File

@ -97,6 +97,9 @@ public class PossibilityIterator implements Iterator<RankedSpellPossibility> {
if(rankedPossibilities.size() >= maximumRequiredSuggestions && rsp.getRank() >= rankedPossibilities.peek().getRank()) {
continue;
}
if (!isSuggestionForReal(rsp)) {
continue;
}
rankedPossibilities.offer(rsp);
if(rankedPossibilities.size() > maximumRequiredSuggestions) {
rankedPossibilities.poll();
@ -109,6 +112,15 @@ public class PossibilityIterator implements Iterator<RankedSpellPossibility> {
}
rankedPossibilityIterator = Arrays.asList(rpArr).iterator();
}
/**
 * Decides whether a ranked possibility really changes the query, i.e.
 * whether at least one of its corrections differs from the original text
 * of the token it replaces.
 *
 * @param rsp the candidate whose corrections are inspected
 * @return {@code true} as soon as one correction differs from its original;
 *         {@code false} when every correction repeats its original or there
 *         are no corrections
 */
private boolean isSuggestionForReal(RankedSpellPossibility rsp) {
  boolean altersQuery = false;
  for (SpellCheckCorrection candidate : rsp.getCorrections()) {
    String originalText = candidate.getOriginalAsString();
    String replacement = candidate.getCorrection();
    if (!originalText.equals(replacement)) {
      altersQuery = true;
      break;
    }
  }
  return altersQuery;
}
private boolean internalHasNext() {
return !done;

View File

@ -20,15 +20,24 @@ import org.apache.lucene.analysis.Token;
public class SpellCheckCorrection {
private Token original;
private String originalAsString = null;
private String correction;
private int numberOfOccurences;
/**
 * @return the original token this correction applies to, exactly as it was
 *         last passed to {@link #setOriginal(Token)}; may be {@code null}
 */
public Token getOriginal() {
  return this.original;
}
/**
 * Returns the original token rendered via {@code toString()}. The string is
 * computed on first use and cached in {@code originalAsString} for later
 * calls.
 *
 * @return the cached string form of the original token, or {@code null}
 *         when no original token is set and nothing has been cached
 */
public String getOriginalAsString() {
  // Nothing to compute: either already cached, or there is no token to render.
  if (originalAsString != null || original == null) {
    return originalAsString;
  }
  originalAsString = original.toString();
  return originalAsString;
}
/**
 * Replaces the original token.
 *
 * @param original the new original token
 */
public void setOriginal(Token original) {
  // Drop the cached string so getOriginalAsString() recomputes it from the new token.
  this.originalAsString = null;
  this.original = original;
}
public String getCorrection() {

View File

@ -2,9 +2,11 @@ package org.apache.solr.spelling;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.solr.common.params.SolrParams;
import java.util.Collection;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -22,13 +24,12 @@ import java.util.Collection;
* limitations under the License.
*/
/**
*
*
**/
public class SpellingOptions {
/**
* The tokens to spell check
*/
@ -38,55 +39,69 @@ public class SpellingOptions {
*/
public IndexReader reader;
/**
* The number of suggestions to return, if there are any. Defaults to 1.
* The number of suggestions to return, if there are any. Defaults to 1.
*/
public int count = 1;
/**
* Return only those results that are more popular, as defined by the implementation
*/
public boolean onlyMorePopular;
public Integer alternativeTermCount = null;
public SuggestMode suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
/**
* Provide additional, per implementation, information about the results
*/
public boolean extendedResults;
/**
* Optionally restrict the results to have a minimum accuracy level. Per Implementation.
* By default set to Float.MIN_VALUE.
* Optionally restrict the results to have a minimum accuracy level. Per
* Implementation. By default set to Float.MIN_VALUE.
*/
public float accuracy = Float.MIN_VALUE;
/**
* Any other custom params can be passed through. May be null and is null by default.
* Any other custom params can be passed through. May be null and is null by
* default.
*/
public SolrParams customParams;
public SpellingOptions() {
}
//A couple of convenience ones
public SpellingOptions() {}
// A couple of convenience ones
/**
 * Convenience constructor: sets the tokens and the suggestion count and
 * leaves every other option at its field default.
 *
 * @param tokens the tokens to spell check
 * @param count the number of suggestions to return
 */
public SpellingOptions(Collection<Token> tokens, int count) {
  this.count = count;
  this.tokens = tokens;
}
/**
 * Convenience constructor: sets the tokens and the index reader and leaves
 * every other option at its field default.
 *
 * @param tokens the tokens to spell check
 * @param reader the index reader made available to the spell checker
 */
public SpellingOptions(Collection<Token> tokens, IndexReader reader) {
  this.reader = reader;
  this.tokens = tokens;
}
/**
 * Convenience constructor: sets the tokens, the index reader and the
 * suggestion count, leaving every other option at its field default.
 *
 * @param tokens the tokens to spell check
 * @param reader the index reader made available to the spell checker
 * @param count the number of suggestions to return
 */
public SpellingOptions(Collection<Token> tokens, IndexReader reader, int count) {
  // Tokens and reader are assigned by the two-argument constructor.
  this(tokens, reader);
  this.count = count;
}
public SpellingOptions(Collection<Token> tokens, IndexReader reader, int count, boolean onlyMorePopular, boolean extendedResults, float accuracy, SolrParams customParams) {
public SpellingOptions(Collection<Token> tokens, IndexReader reader,
int count, SuggestMode suggestMode, boolean extendedResults,
float accuracy, SolrParams customParams) {
this.tokens = tokens;
this.reader = reader;
this.count = count;
this.onlyMorePopular = onlyMorePopular;
this.suggestMode = suggestMode;
this.extendedResults = extendedResults;
this.accuracy = accuracy;
this.customParams = customParams;
}
public SpellingOptions(Collection<Token> tokens, IndexReader reader,
int count, Integer alternativeTermCount, SuggestMode suggestMode,
boolean extendedResults, float accuracy, SolrParams customParams) {
this.tokens = tokens;
this.reader = reader;
this.count = count;
this.alternativeTermCount = alternativeTermCount;
this.suggestMode = suggestMode;
this.extendedResults = extendedResults;
this.accuracy = accuracy;
this.customParams = customParams;

View File

@ -30,6 +30,7 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.HighFrequencyDictionary;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.search.suggest.FileDictionary;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
@ -185,11 +186,11 @@ public class Suggester extends SolrSpellChecker {
scratch.offset = 0;
scratch.length = t.length();
List<LookupResult> suggestions = lookup.lookup(scratch,
options.onlyMorePopular, options.count);
(options.suggestMode == SuggestMode.SUGGEST_MORE_POPULAR), options.count);
if (suggestions == null) {
continue;
}
if (!options.onlyMorePopular) {
if (options.suggestMode != SuggestMode.SUGGEST_MORE_POPULAR) {
Collections.sort(suggestions);
}
for (LookupResult lr : suggestions) {

View File

@ -49,12 +49,26 @@ Config for testing spellcheck component
<str name="spellcheckIndexDir">spellchecker1</str>
<str name="buildOnCommit">true</str>
</lst>
<lst name="spellchecker">
<str name="name">default_teststop</str>
<str name="spellcheckIndexDir">default_teststop</str>
<str name="buildOnCommit">true</str>
<str name="field">teststop</str>
</lst>
<lst name="spellchecker">
<str name="name">direct</str>
<str name="classname">solr.DirectSolrSpellChecker</str>
<int name="minQueryLength">3</int>
<float name="maxQueryFrequency">100</float>
<str name="field">teststop</str>
</lst>
<lst name="spellchecker">
<str name="name">direct_lowerfilt</str>
<str name="classname">solr.DirectSolrSpellChecker</str>
<int name="minQueryLength">3</int>
<float name="maxQueryFrequency">100</float>
<str name="field">lowerfilt</str>
</lst>
<lst name="spellchecker">
<str name="name">threshold</str>
<str name="field">lowerfilt</str>

View File

@ -118,6 +118,7 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
index(id, "22", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "23", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "24", "lowerfilt", "The quote red fox jumped over the lazy brown dogs.");
index(id, "25", "lowerfilt", "rod fix");
commit();
handle.clear();
@ -138,5 +139,7 @@ public class DistributedSpellCheckComponentTest extends BaseDistributedSearchTes
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellingParams.SPELLCHECK_EXTENDED_RESULTS, "true", SpellingParams.SPELLCHECK_COUNT, "10", SpellingParams.SPELLCHECK_COLLATE, "true", SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellingParams.SPELLCHECK_MAX_COLLATIONS, "10", SpellingParams.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false");
query("q", "lowerfilt:(+quock +reb)", "fl", "id,lowerfilt", "spellcheck", "true", "qt", requestHandlerName, "shards.qt", requestHandlerName, SpellingParams.SPELLCHECK_EXTENDED_RESULTS, "true", SpellingParams.SPELLCHECK_COUNT, "10", SpellingParams.SPELLCHECK_COLLATE, "true", SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, "0", SpellingParams.SPELLCHECK_MAX_COLLATIONS, "1", SpellingParams.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "false");
query("q", "lowerfilt:(\"quote red fox\")", "fl", "id,lowerfilt", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "1", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_ALTERNATIVE_TERM_COUNT, "5", SpellCheckComponent.SPELLCHECK_MAX_RESULTS_FOR_SUGGEST, "10");
query("q", "lowerfilt:(\"rod fix\")", "fl", "id,lowerfilt", "spellcheck", "true", "qt", "spellCheckCompRH", "shards.qt", "spellCheckCompRH", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_COUNT, "10", SpellCheckComponent.SPELLCHECK_COLLATE, "true", SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10", SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "1", SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true", SpellCheckComponent.SPELLCHECK_ALTERNATIVE_TERM_COUNT, "5", SpellCheckComponent.SPELLCHECK_MAX_RESULTS_FOR_SUGGEST, "10");
}
}

View File

@ -124,10 +124,16 @@ public class SpellCheckComponentTest extends SolrTestCaseJ4 {
@Test
public void testCorrectSpelling() throws Exception {
// Make sure correct spellings are signaled in the response
assertJQ(req("json.nl","map", "qt",rh, SpellCheckComponent.COMPONENT_NAME, "true", "q","lowerfilt:lazy lowerfilt:brown", SpellingParams.SPELLCHECK_EXTENDED_RESULTS, "true")
assertJQ(req("json.nl","map", "qt",rh, SpellCheckComponent.COMPONENT_NAME, "true",
"q","lowerfilt:lazy lowerfilt:brown", SpellingParams.SPELLCHECK_EXTENDED_RESULTS, "true")
,"/spellcheck/suggestions=={'correctlySpelled':true}"
);
assertJQ(req("json.nl","map", "qt",rh, SpellCheckComponent.COMPONENT_NAME, "true", "q","lakkle", SpellingParams.SPELLCHECK_EXTENDED_RESULTS, "true")
assertJQ(req("json.nl","map", "qt",rh, SpellCheckComponent.COMPONENT_NAME, "true", "spellcheck.dictionary", "direct_lowerfilt",
"q","lowerfilt:lazy lowerfilt:brown", SpellingParams.SPELLCHECK_EXTENDED_RESULTS, "true")
,"/spellcheck/suggestions=={'correctlySpelled':true}"
);
assertJQ(req("json.nl","map", "qt",rh, SpellCheckComponent.COMPONENT_NAME, "true", "spellcheck.dictionary", "direct_lowerfilt",
"q","lakkle", SpellingParams.SPELLCHECK_EXTENDED_RESULTS, "true")
,"/spellcheck/suggestions/correctlySpelled==false"
);
}

View File

@ -27,6 +27,7 @@ import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
import org.apache.lucene.store.Directory;
@ -199,7 +200,7 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 {
IndexReader reader = searcher.getIndexReader();
Collection<Token> tokens = queryConverter.convert("documemt");
SpellingOptions spellOpts = new SpellingOptions(tokens, reader, 1, false, true, 0.5f, null);
SpellingOptions spellOpts = new SpellingOptions(tokens, reader, 1, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, true, 0.5f, null);
SpellingResult result = checker.getSuggestions(spellOpts);
assertTrue("result is null and it shouldn't be", result != null);
//should be lowercased, b/c we are using a lowercasing analyzer
@ -312,7 +313,7 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 {
IndexReader reader = searcher.getIndexReader();
Collection<Token> tokens = queryConverter.convert("flesh");
SpellingOptions spellOpts = new SpellingOptions(tokens, reader, 1, false, true, 0.5f, null);
SpellingOptions spellOpts = new SpellingOptions(tokens, reader, 1, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, true, 0.5f, null);
SpellingResult result = checker.getSuggestions(spellOpts);
assertTrue("result is null and it shouldn't be", result != null);
//should be lowercased, b/c we are using a lowercasing analyzer

View File

@ -40,7 +40,7 @@ import org.junit.Test;
public class SpellCheckCollatorTest extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig.xml", "schema.xml");
initCore("solrconfig-spellcheckcomponent.xml", "schema.xml");
assertNull(h.validateUpdate(adoc("id", "0", "lowerfilt", "faith hope and love")));
assertNull(h.validateUpdate(adoc("id", "1", "lowerfilt", "faith hope and loaves")));
assertNull(h.validateUpdate(adoc("id", "2", "lowerfilt", "fat hops and loaves")));
@ -48,6 +48,12 @@ public class SpellCheckCollatorTest extends SolrTestCaseJ4 {
assertNull(h.validateUpdate(adoc("id", "4", "lowerfilt", "fat of homer")));
assertNull(h.validateUpdate(adoc("id", "5", "lowerfilt1", "peace")));
assertNull(h.validateUpdate(adoc("id", "6", "lowerfilt", "hyphenated word")));
assertNull(h.validateUpdate(adoc("id", "7", "teststop", "Jane filled out a form at Charles De Gaulle")));
assertNull(h.validateUpdate(adoc("id", "8", "teststop", "Dick flew from Heathrow")));
assertNull(h.validateUpdate(adoc("id", "9", "teststop", "Jane is stuck in customs because Spot chewed up the form")));
assertNull(h.validateUpdate(adoc("id", "10", "teststop", "Once in Paris Dick built a fire on the hearth")));
assertNull(h.validateUpdate(adoc("id", "11", "teststop", "Dick waited for Jane as he watched the sparks flow upward")));
assertNull(h.validateUpdate(adoc("id", "12", "teststop", "This June parisian rendez-vous is ruined because of a customs snafu")));
assertNull(h.validateUpdate(commit()));
}
@ -324,4 +330,62 @@ public class SpellCheckCollatorTest extends SolrTestCaseJ4 {
List<String> collations = suggestions.getAll("collation");
assertTrue(collations.size() == 1);
}
/**
 * Checks context-sensitive suggestions and collations
 * ("spellcheck.alternativeTermCount" combined with
 * "spellcheck.maxResultsForSuggest") against every dictionary listed in
 * {@code dictionary}.
 *
 * Two requests are issued per dictionary: one with maxResultsForSuggest=0
 * and one with maxResultsForSuggest=1. Both expect alternative terms for
 * words that exist in the index, "correctlySpelled" to be false, and a
 * collation with exactly one hit.
 */
@Test
public void testContextSensitiveCollate() throws Exception {
  //                     DirectSolrSpellChecker   IndexBasedSpellChecker
  String[] dictionary = {"direct",                "default_teststop" };
  // Iterate over every entry: the previous bound was the literal 1, which
  // silently skipped the second dictionary.
  for (String dictionaryName : dictionary) {
    assertQ(
      req(
        "q", "teststop:(flew AND form AND heathrow)",
        "qt", "spellCheckCompRH",
        "indent", "true",
        SpellCheckComponent.COMPONENT_NAME, "true",
        SpellCheckComponent.SPELLCHECK_DICT, dictionaryName,
        SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true",
        SpellCheckComponent.SPELLCHECK_COUNT, "10",
        SpellCheckComponent.SPELLCHECK_ALTERNATIVE_TERM_COUNT, "5",
        SpellCheckComponent.SPELLCHECK_MAX_RESULTS_FOR_SUGGEST, "0",
        SpellCheckComponent.SPELLCHECK_COLLATE, "true",
        SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10",
        SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "1",
        SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true"
      ),
      "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='flew']/arr[@name='suggestion']/lst/str[@name='word']='flow'",
      "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='form']/arr[@name='suggestion']/lst/str[@name='word']='from'",
/* DirectSolrSpellChecker won't suggest if the edit distance > 2, so we can't test for this one...
      "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='heathrow']/arr[@name='suggestion']/lst/str[@name='word']='hearth'",
*/
      "//lst[@name='spellcheck']/lst[@name='suggestions']/bool[@name='correctlySpelled']='false'",
      "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/str[@name='collationQuery']='teststop:(flew AND from AND heathrow)'",
      "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/int[@name='hits']=1",
      "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/lst[@name='misspellingsAndCorrections']/str[@name='form']='from'"
    );

    assertQ(
      req(
        "q", "teststop:(june AND customs)",
        "qt", "spellCheckCompRH",
        "indent", "true",
        SpellCheckComponent.COMPONENT_NAME, "true",
        SpellCheckComponent.SPELLCHECK_DICT, dictionaryName,
        SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true",
        SpellCheckComponent.SPELLCHECK_COUNT, "10",
        SpellCheckComponent.SPELLCHECK_ALTERNATIVE_TERM_COUNT, "5",
        SpellCheckComponent.SPELLCHECK_MAX_RESULTS_FOR_SUGGEST, "1",
        SpellCheckComponent.SPELLCHECK_COLLATE, "true",
        SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10",
        SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "1",
        SpellCheckComponent.SPELLCHECK_COLLATE_EXTENDED_RESULTS, "true"
      ),
      "//result[@numFound=1]",
      "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='june']/arr[@name='suggestion']/lst/str[@name='word']='jane'",
      "//lst[@name='spellcheck']/lst[@name='suggestions']/bool[@name='correctlySpelled']='false'",
      "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/str[@name='collationQuery']='teststop:(jane AND customs)'",
      "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/int[@name='hits']=1",
      "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='collation']/lst[@name='misspellingsAndCorrections']/str[@name='june']='jane'"
    );
  }
}
}

View File

@ -33,14 +33,39 @@ public interface SpellingParams {
public static final String SPELLCHECK_DICT = SPELLCHECK_PREFIX + "dictionary";
/**
* The count of suggestions needed for a given query.
* The count of suggestions to return for each query term not in the index and/or dictionary.
* <p/>
* If this parameter is absent in the request then only one suggestion is
* returned. If it is greater than one, then at most that many suggestions
* are returned for each token in the query.
*/
public static final String SPELLCHECK_COUNT = SPELLCHECK_PREFIX + "count";
/**
* The count of suggestions to return for each query term existing in the index and/or dictionary.
* <p/>
* If this parameter is absent in the request then no suggestions are generated for such terms. This parameter allows
* for receiving alternative terms to use in context-sensitive spelling corrections.
*/
public static final String SPELLCHECK_ALTERNATIVE_TERM_COUNT = SPELLCHECK_PREFIX + "alternativeTermCount";
/**
* <p>
* The maximum number of hits the request can return in order to both
* generate spelling suggestions and set the "correctlySpelled" element to "false".
* Note that this parameter is typically of use only in conjunction with "spellcheck.alternativeTermCount".
* </p>
* <p>
* If left unspecified, the default behavior will prevail. That is, "correctlySpelled" will be false and suggestions
* will be returned only if one or more of the query terms are absent from the dictionary and/or index. If set to zero,
* the "correctlySpelled" flag will be false only if the response returns zero hits. If set to a value greater than zero,
* suggestions will be returned even if hits are returned (up to the specified number). This number also will serve as
* the threshold in determining the value of "correctlySpelled". Specifying a value greater than zero is useful
* for creating "did-you-mean" suggestions for queries that return a low number of hits.
* </p>
*/
public static final String SPELLCHECK_MAX_RESULTS_FOR_SUGGEST = SPELLCHECK_PREFIX + "maxResultsForSuggest";
/**
* When this parameter is set to true and the misspelled word exists in the
* user field, only words that occur more frequently in the Solr field than
@ -49,7 +74,7 @@ public interface SpellingParams {
* <b>This is applicable only for dictionaries built from Solr fields.</b>
*/
public static final String SPELLCHECK_ONLY_MORE_POPULAR = SPELLCHECK_PREFIX + "onlyMorePopular";
/**
* Whether to use the extended response format, which is more complicated but
* richer. Returns the document frequency for each suggestion and returns one