From 1620366b5ca652fce4f7a97d0c9b567828886782 Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Thu, 19 Aug 2010 15:01:29 +0000 Subject: [PATCH] LUCENE-2608: Add per-method and request accuracy to spell checker git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@987179 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 3 + .../lucene/search/spell/SpellChecker.java | 150 +++++++++++++----- .../lucene/search/spell/TestSpellChecker.java | 12 +- solr/CHANGES.txt | 11 +- .../solr/common/params/SpellingParams.java | 5 + .../component/SpellCheckComponent.java | 36 ++++- .../spelling/AbstractLuceneSpellChecker.java | 29 ++-- .../solr/spelling/SolrSpellChecker.java | 41 +---- .../apache/solr/spelling/SpellingOptions.java | 94 +++++++++++ .../DummyCustomParamSpellChecker.java | 61 +++++++ .../component/SpellCheckComponentTest.java | 50 +++++- .../spelling/FileBasedSpellCheckerTest.java | 24 +-- .../spelling/IndexBasedSpellCheckerTest.java | 70 ++++---- .../test/test-files/solr/conf/solrconfig.xml | 6 +- 14 files changed, 446 insertions(+), 146 deletions(-) create mode 100644 solr/src/java/org/apache/solr/spelling/SpellingOptions.java create mode 100644 solr/src/test/org/apache/solr/handler/component/DummyCustomParamSpellChecker.java diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index a989e0d625f..010a2e5eacd 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -23,6 +23,9 @@ New Features * LUCENE-2479: Added ability to provide a sort comparator for spelling suggestions along with two implementations. The existing comparator (score, then frequency) is the default (Grant Ingersoll) + * LUCENE-2608: Added the ability to specify the accuracy per method call in the SpellChecker. The per-instance + setter is also still available. 
(Grant Ingersoll) + API Changes * LUCENE-2606: Changed RegexCapabilities interface to fix thread diff --git a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java index 9c9dac18cf1..485dd8a1ce5 100755 --- a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java +++ b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java @@ -62,11 +62,16 @@ import org.apache.lucene.util.Version; */ public class SpellChecker implements java.io.Closeable { + /** + * The default minimum score to use, if not specified by calling {@link #setAccuracy(float)} . + */ + public static final float DEFAULT_ACCURACY = 0.5f; + /** * Field name for each word in the ngram index. */ public static final String F_WORD = "word"; - + private static final Term F_WORD_TERM = new Term(F_WORD); /** @@ -75,35 +80,34 @@ public class SpellChecker implements java.io.Closeable { // don't modify the directory directly - see #swapSearcher() // TODO: why is this package private? Directory spellIndex; - /** * Boost value for start and end grams */ private float bStart = 2.0f; - private float bEnd = 1.0f; + private float bEnd = 1.0f; // don't use this searcher directly - see #swapSearcher() + private IndexSearcher searcher; - /* - * this locks all modifications to the current searcher. + * this locks all modifications to the current searcher. */ + private final Object searcherLock = new Object(); - /* - * this lock synchronizes all possible modifications to the + * this lock synchronizes all possible modifications to the * current index directory. It should not be possible to try modifying * the same index concurrently. Note: Do not acquire the searcher lock - * before acquiring this lock! + * before acquiring this lock! 
*/ private final Object modifyCurrentIndexLock = new Object(); + private volatile boolean closed = false; - // minimum score for hits generated by the spell checker query - private float minScore = 0.5f; - - private StringDistance sd; + private float accuracy = DEFAULT_ACCURACY; + + private StringDistance sd; private Comparator comparator; /** @@ -202,10 +206,20 @@ public class SpellChecker implements java.io.Closeable { } /** - * Sets the accuracy 0 < minScore < 1; default 0.5 + * Sets the accuracy (minimum score), 0 < acc < 1; default {@link #DEFAULT_ACCURACY} + * @param acc The new accuracy */ - public void setAccuracy(float minScore) { - this.minScore = minScore; + public void setAccuracy(float acc) { + this.accuracy = acc; + } + + /** + * The accuracy (minimum score) to be used, unless overridden in {@link #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)}, to + * decide whether a suggestion is included or not. + * @return The current accuracy setting + */ + public float getAccuracy() { + return accuracy; } /** @@ -224,11 +238,37 @@ public class SpellChecker implements java.io.Closeable { * @throws IOException if the underlying index throws an {@link IOException} * @throws AlreadyClosedException if the Spellchecker is already closed * @return String[] + * + * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float) */ public String[] suggestSimilar(String word, int numSug) throws IOException { return this.suggestSimilar(word, numSug, null, null, false); } + /** + * Suggest similar words. + * + *

As the Lucene similarity that is used to fetch the most relevant n-grammed terms + * is not the same as the edit distance strategy used to calculate the best + * matching spell-checked word from the hits that Lucene found, one usually has + * to retrieve a couple of numSug's in order to get the true best match. + * + *

I.e. if numSug == 1, don't count on that suggestion being the best one. + * Thus, you should set this value to at least 5 for a good suggestion. + * + * @param word the word you want a spell check done on + * @param numSug the number of suggested words + * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results + * @throws IOException if the underlying index throws an {@link IOException} + * @throws AlreadyClosedException if the Spellchecker is already closed + * @return String[] + * + * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float) + */ + public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException { + return this.suggestSimilar(word, numSug, null, null, false, accuracy); + } + /** * Suggest similar words (optionally restricted to a field of an index). * @@ -240,6 +280,40 @@ public class SpellChecker implements java.io.Closeable { *

I.e. if numSug == 1, don't count on that suggestion being the best one. * Thus, you should set this value to at least 5 for a good suggestion. * + *

Uses the {@link #getAccuracy()} value (the default, or whatever was last set via {@link #setAccuracy(float)}) as the accuracy. + * + * @param word the word you want a spell check done on + * @param numSug the number of suggested words + * @param ir the indexReader of the user index (can be null see field param) + * @param field the field of the user index: if field is not null, the suggested + * words are restricted to the words present in this field. + * @param morePopular return only the suggested words that are as frequent or more frequent than the searched word + * (only if restricted mode = (indexReader!=null and field!=null)) + * @throws IOException if the underlying index throws an {@link IOException} + * @throws AlreadyClosedException if the Spellchecker is already closed + * @return String[] the sorted list of the suggested words with these 2 criteria: + * first criteria: the edit distance, second criteria (only if restricted mode): the popularity + * of the suggested words in the field of the user index + * + * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float) + */ + public String[] suggestSimilar(String word, int numSug, IndexReader ir, + String field, boolean morePopular) throws IOException { + return suggestSimilar(word, numSug, ir, field, morePopular, accuracy); + } + + + /** + * Suggest similar words (optionally restricted to a field of an index). + * + *

As the Lucene similarity that is used to fetch the most relevant n-grammed terms + * is not the same as the edit distance strategy used to calculate the best + * matching spell-checked word from the hits that Lucene found, one usually has + * to retrieve a couple of numSug's in order to get the true best match. + * + *

I.e. if numSug == 1, don't count on that suggestion being the best one. + * Thus, you should set this value to at least 5 for a good suggestion. + * * @param word the word you want a spell check done on * @param numSug the number of suggested words * @param ir the indexReader of the user index (can be null see field param) @@ -247,6 +321,7 @@ public class SpellChecker implements java.io.Closeable { * words are restricted to the words present in this field. * @param morePopular return only the suggest words that are as frequent or more frequent than the searched word * (only if restricted mode = (indexReader!=null and field!=null) + * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results * @throws IOException if the underlying index throws an {@link IOException} * @throws AlreadyClosedException if the Spellchecker is already closed * @return String[] the sorted list of the suggest words with these 2 criteria: @@ -254,72 +329,72 @@ public class SpellChecker implements java.io.Closeable { * of the suggest words in the field of the user index */ public String[] suggestSimilar(String word, int numSug, IndexReader ir, - String field, boolean morePopular) throws IOException { + String field, boolean morePopular, float accuracy) throws IOException { // obtainSearcher calls ensureOpen final IndexSearcher indexSearcher = obtainSearcher(); try{ - float min = this.minScore; + final int lengthWord = word.length(); - + final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0; final int goalFreq = (morePopular && ir != null && field != null) ? 
freq : 0; // if the word exists in the real index and we don't care for word frequency, return the word itself if (!morePopular && freq > 0) { return new String[] { word }; } - + BooleanQuery query = new BooleanQuery(); String[] grams; String key; - + for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) { - + key = "gram" + ng; // form key - + grams = formGrams(word, ng); // form word into ngrams (allow dups too) - + if (grams.length == 0) { continue; // hmm } - + if (bStart > 0) { // should we boost prefixes? add(query, "start" + ng, grams[0], bStart); // matches start of word - + } if (bEnd > 0) { // should we boost suffixes add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word - + } for (int i = 0; i < grams.length; i++) { add(query, key, grams[i]); } } - + int maxHits = 10 * numSug; - + // System.out.println("Q: " + query); ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs; // System.out.println("HITS: " + hits.length()); SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator); - + // go thru more than 'maxr' matches in case the distance filter triggers int stop = Math.min(hits.length, maxHits); SuggestWord sugWord = new SuggestWord(); for (int i = 0; i < stop; i++) { - + sugWord.string = indexSearcher.doc(hits[i].doc).get(F_WORD); // get orig word - + // don't suggest a word for itself, that would be silly if (sugWord.string.equals(word)) { continue; } - + // edit distance sugWord.score = sd.getDistance(word,sugWord.string); - if (sugWord.score < min) { + if (sugWord.score < accuracy) { continue; } - + if (ir != null && field != null) { // use the user index sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index // don't suggest a word that is not present in the field @@ -330,23 +405,22 @@ public class SpellChecker implements java.io.Closeable { sugQueue.insertWithOverflow(sugWord); if (sugQueue.size() == numSug) { // if queue full, maintain the minScore score - 
min = sugQueue.top().score; + accuracy = sugQueue.top().score; } sugWord = new SuggestWord(); } - + // convert to array string String[] list = new String[sugQueue.size()]; for (int i = sugQueue.size() - 1; i >= 0; i--) { list[i] = sugQueue.pop().string; } - + return list; } finally { releaseSearcher(indexSearcher); } } - /** * Add a clause to a boolean query. */ diff --git a/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java b/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java index 0f0a053c3a5..888c2c0564d 100755 --- a/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java +++ b/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java @@ -104,11 +104,21 @@ public class TestSpellChecker extends LuceneTestCase { spellChecker.setAccuracy(0.8f); checkCommonSuggestions(r); checkJaroWinklerSuggestions(); + // the accuracy is set to 0.8 by default, but the best result has a score of 0.925 + String[] similar = spellChecker.suggestSimilar("fvie", 2, 0.93f); + assertTrue(similar.length == 0); + similar = spellChecker.suggestSimilar("fvie", 2, 0.92f); + assertTrue(similar.length == 1); + + similar = spellChecker.suggestSimilar("fiv", 2); + assertTrue(similar.length > 0); + assertEquals(similar[0], "five"); spellChecker.setStringDistance(new NGramDistance(2)); spellChecker.setAccuracy(0.5f); checkCommonSuggestions(r); checkNGramSuggestions(); + r.close(); } @@ -127,8 +137,6 @@ public class TestSpellChecker extends LuceneTestCase { if (!compareSP.isClosed()) compareSP.close(); compIdx.close(); - - } private void checkCommonSuggestions(IndexReader r) throws IOException { diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 4407ba49875..aeb7116d2b4 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -77,6 +77,12 @@ Upgrading from Solr 1.4 legacy behavior should set a default value for the 'mm' param in their solrconfig.xml file. 
+* LUCENE-2608: Added the ability to specify the accuracy on a per request basis. + Implementations of SolrSpellChecker must change over to the new SolrSpellChecker + abstract methods using the new SpellingOptions class. While this change is not + backward compatible, it should be trivial to migrate as the SpellingOptions class + just encapsulates the parameters that were passed in to the methods before the change. (gsingers) + Detailed Change List ---------------------- @@ -226,11 +232,6 @@ New Features * SOLR-2053: Add support for custom comparators in Solr spellchecker, per LUCENE-2479 (gsingers) -* SOLR-2049: Add hl.multiValuedSeparatorChar for FastVectorHighlighter, per LUCENE-2603. (koji) - -* SOLR-1881: add a url-scheme config string to SearchHandler to specify alternate - URL prefixes for distributed search shard requests. (Sami Siren via yonik) - Optimizations ---------------------- diff --git a/solr/src/common/org/apache/solr/common/params/SpellingParams.java b/solr/src/common/org/apache/solr/common/params/SpellingParams.java index b0c77a05292..277a1018c8c 100644 --- a/solr/src/common/org/apache/solr/common/params/SpellingParams.java +++ b/solr/src/common/org/apache/solr/common/params/SpellingParams.java @@ -81,4 +81,9 @@ public interface SpellingParams { * Take the top suggestion for each token and create a new query from it */ public static final String SPELLCHECK_COLLATE = SPELLCHECK_PREFIX + "collate"; + + /** + * Certain spelling implementations may allow for an accuracy setting. 
+ */ + public static final String SPELLCHECK_ACCURACY = SPELLCHECK_PREFIX + "accuracy"; } diff --git a/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java b/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java index be5a7731486..4b0c8c4583f 100644 --- a/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java +++ b/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java @@ -23,11 +23,13 @@ import java.util.*; import java.util.concurrent.ConcurrentHashMap; import org.apache.lucene.search.spell.LevensteinDistance; +import org.apache.lucene.search.spell.SpellChecker; import org.apache.lucene.search.spell.StringDistance; import org.apache.lucene.search.spell.SuggestWord; import org.apache.lucene.search.spell.SuggestWordQueue; import org.apache.lucene.util.PriorityQueue; import org.apache.solr.client.solrj.response.SpellCheckResponse; +import org.apache.solr.common.params.ModifiableSolrParams; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -144,8 +146,12 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar NamedList response = new SimpleOrderedMap(); IndexReader reader = rb.req.getSearcher().getReader(); boolean collate = params.getBool(SPELLCHECK_COLLATE, false); - SpellingResult spellingResult = spellChecker.getSuggestions(tokens, - reader, count, onlyMorePopular, extendedResults); + float accuracy = params.getFloat(SPELLCHECK_ACCURACY, Float.MIN_VALUE); + SolrParams customParams = getCustomParams(getDictionaryName(params), params); + SpellingOptions options = new SpellingOptions(tokens, reader, count, onlyMorePopular, extendedResults, + accuracy, customParams); + + SpellingResult spellingResult = spellChecker.getSuggestions(options); if (spellingResult != null) { response.add("suggestions", toNamedList(spellingResult, q, extendedResults, collate)); @@ -159,6 +165,24 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar } 
} + /** + * For every param that is of the form "spellcheck.[dictionary name].XXXX=YYYY, add + * XXXX=YYYY as a param to the custom param list + * @param params The original SolrParams + * @return The new Params + */ + protected SolrParams getCustomParams(String dictionary, SolrParams params) { + ModifiableSolrParams result = new ModifiableSolrParams(); + Iterator iter = params.getParameterNamesIterator(); + String prefix = SpellingParams.SPELLCHECK_PREFIX + "." + dictionary + "."; + while (iter.hasNext()){ + String nxt = iter.next(); + if (nxt.startsWith(prefix)){ + result.add(nxt.substring(prefix.length()), params.getParams(nxt)); + } + } + return result; + } @Override @@ -341,13 +365,17 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar } protected SolrSpellChecker getSpellChecker(SolrParams params) { + return spellCheckers.get(getDictionaryName(params)); + } + + private String getDictionaryName(SolrParams params) { String dictName = params.get(SPELLCHECK_DICT); if (dictName == null) { dictName = SolrSpellChecker.DEFAULT_DICTIONARY_NAME; } - return spellCheckers.get(dictName); + return dictName; } - + /** * @return the spellchecker registered to a given name */ diff --git a/solr/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java b/solr/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java index 5a32029af6a..a76506cefb9 100644 --- a/solr/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java +++ b/solr/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java @@ -150,29 +150,30 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker { return name; } - @SuppressWarnings("unchecked") - public SpellingResult getSuggestions(Collection tokens, - IndexReader reader, int count, boolean onlyMorePopular, - boolean extendedResults) - throws IOException { - SpellingResult result = new SpellingResult(tokens); - reader = determineReader(reader); + @Override + public 
SpellingResult getSuggestions(SpellingOptions options) throws IOException { + SpellingResult result = new SpellingResult(options.tokens); + IndexReader reader = determineReader(options.reader); Term term = field != null ? new Term(field, "") : null; - for (Token token : tokens) { + float theAccuracy = (options.accuracy == Float.MIN_VALUE) ? spellChecker.getAccuracy() : options.accuracy; + + int count = (int) Math.max(options.count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT); + for (Token token : options.tokens) { String tokenText = new String(token.buffer(), 0, token.length()); - String[] suggestions = spellChecker.suggestSimilar(tokenText, (int) Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT), + String[] suggestions = spellChecker.suggestSimilar(tokenText, + count, field != null ? reader : null, //workaround LUCENE-1295 field, - onlyMorePopular); + options.onlyMorePopular, theAccuracy); if (suggestions.length == 1 && suggestions[0].equals(tokenText)) { //These are spelled the same, continue on continue; } - if (extendedResults == true && reader != null && field != null) { + if (options.extendedResults == true && reader != null && field != null) { term = term.createTerm(tokenText); result.add(token, reader.docFreq(term)); - int countLimit = Math.min(count, suggestions.length); + int countLimit = Math.min(options.count, suggestions.length); for (int i = 0; i < countLimit; i++) { term = term.createTerm(suggestions[i]); result.add(token, suggestions[i], reader.docFreq(term)); @@ -180,8 +181,8 @@ public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker { } else { if (suggestions.length > 0) { List suggList = Arrays.asList(suggestions); - if (suggestions.length > count) { - suggList = suggList.subList(0, count); + if (suggestions.length > options.count) { + suggList = suggList.subList(0, options.count); } result.add(token, suggList); } diff --git a/solr/src/java/org/apache/solr/spelling/SolrSpellChecker.java 
b/solr/src/java/org/apache/solr/spelling/SolrSpellChecker.java index eebe5c1bb03..4e702086d2e 100644 --- a/solr/src/java/org/apache/solr/spelling/SolrSpellChecker.java +++ b/solr/src/java/org/apache/solr/spelling/SolrSpellChecker.java @@ -70,46 +70,15 @@ public abstract class SolrSpellChecker { */ public abstract void build(SolrCore core, SolrIndexSearcher searcher); - /** - * Assumes count = 1, onlyMorePopular = false, extendedResults = false - * - * @see #getSuggestions(Collection, org.apache.lucene.index.IndexReader, int, boolean, boolean) - */ - public SpellingResult getSuggestions(Collection tokens, IndexReader reader) throws IOException { - return getSuggestions(tokens, reader, 1, false, false); - } - - /** - * Assumes onlyMorePopular = false, extendedResults = false - * - * @see #getSuggestions(Collection, org.apache.lucene.index.IndexReader, int, boolean, boolean) - */ - public SpellingResult getSuggestions(Collection tokens, IndexReader reader, int count) throws IOException { - return getSuggestions(tokens, reader, count, false, false); - } - - - /** - * Assumes count = 1. - * - * @see #getSuggestions(Collection, org.apache.lucene.index.IndexReader, int, boolean, boolean) - */ - public SpellingResult getSuggestions(Collection tokens, IndexReader reader, boolean onlyMorePopular, boolean extendedResults) throws IOException { - return getSuggestions(tokens, reader, 1, onlyMorePopular, extendedResults); - } /** * Get suggestions for the given query. Tokenizes the query using a field appropriate Analyzer. * The {@link SpellingResult#getSuggestions()} suggestions must be ordered by best suggestion first. + *

* - * @param tokens The Tokens to be spell checked. - * @param reader The (optional) IndexReader. If there is not IndexReader, than extendedResults are not possible - * @param count The maximum number of suggestions to return - * @param onlyMorePopular TODO - * @param extendedResults TODO - * @throws IOException + * @param options The {@link SpellingOptions} to use + * @return The {@link SpellingResult} suggestions + * @throws IOException if there is an error producing suggestions */ - public abstract SpellingResult getSuggestions(Collection tokens, IndexReader reader, int count, - boolean onlyMorePopular, boolean extendedResults) - throws IOException; + public abstract SpellingResult getSuggestions(SpellingOptions options) throws IOException; } diff --git a/solr/src/java/org/apache/solr/spelling/SpellingOptions.java b/solr/src/java/org/apache/solr/spelling/SpellingOptions.java new file mode 100644 index 00000000000..4a0a19edfaa --- /dev/null +++ b/solr/src/java/org/apache/solr/spelling/SpellingOptions.java @@ -0,0 +1,94 @@ +package org.apache.solr.spelling; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.index.IndexReader; +import org.apache.solr.common.params.SolrParams; + +import java.util.Collection; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * Encapsulates the parameters passed to {@link SolrSpellChecker#getSuggestions(SpellingOptions)}. + * + **/ +public class SpellingOptions { + + /** + * The tokens to spell check + */ + public Collection tokens; + /** + * An optional {@link org.apache.lucene.index.IndexReader} + */ + public IndexReader reader; + /** + * The number of suggestions to return, if there are any. Defaults to 1. + */ + public int count = 1; + /** + * Return only those results that are more popular, as defined by the implementation + */ + public boolean onlyMorePopular; + /** + * Provide additional, per implementation, information about the results + */ + public boolean extendedResults; + + /** + * Optionally restrict the results to have a minimum accuracy level. Interpretation is per implementation. + * Defaults to Float.MIN_VALUE, which {@link AbstractLuceneSpellChecker} treats as "not specified" and replaces with its configured accuracy. + */ + public float accuracy = Float.MIN_VALUE; + + /** + * Any other custom params can be passed through. May be null and is null by default. + */ + public SolrParams customParams; + + public SpellingOptions() { + } + + //A couple of convenience ones + public SpellingOptions(Collection tokens, int count) { + this.tokens = tokens; + this.count = count; + } + + public SpellingOptions(Collection tokens, IndexReader reader) { + this.tokens = tokens; + this.reader = reader; + } + + public SpellingOptions(Collection tokens, IndexReader reader, int count) { + this.tokens = tokens; + this.reader = reader; + this.count = count; + } + + + public SpellingOptions(Collection tokens, IndexReader reader, int count, boolean onlyMorePopular, boolean extendedResults, float accuracy, SolrParams customParams) { + this.tokens = tokens; + this.reader = reader; + this.count = count; + this.onlyMorePopular = onlyMorePopular; + this.extendedResults = extendedResults; + this.accuracy = accuracy; + this.customParams = customParams; + } +} diff --git a/solr/src/test/org/apache/solr/handler/component/DummyCustomParamSpellChecker.java 
b/solr/src/test/org/apache/solr/handler/component/DummyCustomParamSpellChecker.java new file mode 100644 index 00000000000..efb79eb4ff0 --- /dev/null +++ b/solr/src/test/org/apache/solr/handler/component/DummyCustomParamSpellChecker.java @@ -0,0 +1,61 @@ +package org.apache.solr.handler.component; + +import org.apache.lucene.analysis.Token; +import org.apache.solr.core.SolrCore; +import org.apache.solr.search.SolrIndexSearcher; +import org.apache.solr.spelling.SolrSpellChecker; +import org.apache.solr.spelling.SpellingOptions; +import org.apache.solr.spelling.SpellingResult; + +import java.io.IOException; +import java.util.Collections; +import java.util.Iterator; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+
+/**
+ * Test-only spell checker: every custom parameter is echoed back as a suggestion
+ * (token text = parameter name, single suggestion = parameter value); reload and build do nothing.
+ **/
+public class DummyCustomParamSpellChecker extends SolrSpellChecker {
+
+  @Override
+  public void reload() throws IOException {
+    // no-op
+  }
+
+  @Override
+  public void build(SolrCore core, SolrIndexSearcher searcher) {
+    // no-op
+  }
+
+  @Override
+  public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
+    // Echoes options.customParams: one Token per parameter name, suggested value = that parameter's value.
+    SpellingResult result = new SpellingResult();
+    // Offsets are synthetic: the two i++ arguments evaluate left to right, giving 0/1, then 2/3, ...
+    Iterator<String> iterator = options.customParams.getParameterNamesIterator();
+    int i = 0;
+    while (iterator.hasNext()){
+      String name = iterator.next();
+      String value = options.customParams.get(name);
+      result.add(new Token(name, i++, i++), Collections.singletonList(value));
+    }
+    return result;
+  }
+}
diff --git a/solr/src/test/org/apache/solr/handler/component/SpellCheckComponentTest.java b/solr/src/test/org/apache/solr/handler/component/SpellCheckComponentTest.java index 8beac130acb..949fc8c47bb 100644 --- a/solr/src/test/org/apache/solr/handler/component/SpellCheckComponentTest.java +++ b/solr/src/test/org/apache/solr/handler/component/SpellCheckComponentTest.java @@ -24,6 +24,7 @@ import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.MapSolrParams; import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.SpellingParams; import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.core.SolrCore; @@ -33,7 +34,6 @@ import org.apache.solr.request.SolrRequestHandler; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.spelling.AbstractLuceneSpellChecker; import org.apache.solr.spelling.IndexBasedSpellChecker; -import org.apache.solr.util.AbstractSolrTestCase; import org.junit.BeforeClass; import org.junit.Test; @@ -133,9 +133,9 @@ public class SpellCheckComponentTest extends SolrTestCaseJ4 { assertTrue(cmdExec + 
" is not equal to " + "build", cmdExec.equals("build") == true); NamedList spellCheck = (NamedList) values.get("spellcheck"); - assertTrue("spellCheck is null and it shouldn't be", spellCheck != null); + assertNotNull(spellCheck); NamedList suggestions = (NamedList) spellCheck.get("suggestions"); - assertTrue("suggestions is null and it shouldn't be", suggestions != null); + assertNotNull(suggestions); NamedList document = (NamedList) suggestions.get("documemt"); assertEquals(1, document.get("numFound")); assertEquals(0, document.get("startOffset")); @@ -145,6 +145,50 @@ public class SpellCheckComponentTest extends SolrTestCaseJ4 { assertEquals("document", theSuggestion.iterator().next()); }
+
+  @Test
+  public void testPerDictionary() throws Exception {
+    SolrCore core = h.getCore();
+    SearchComponent speller = core.getSearchComponent("spellcheck");
+    assertNotNull("speller is null and it shouldn't be", speller);
+
+    ModifiableSolrParams params = new ModifiableSolrParams();
+    params.add(CommonParams.QT, "spellCheckCompRH");
+    params.add(SpellCheckComponent.SPELLCHECK_BUILD, "true");
+    params.add(CommonParams.Q, "documemt");
+    params.add(SpellCheckComponent.COMPONENT_NAME, "true");
+    params.add(SpellingParams.SPELLCHECK_DICT, "perDict");
+    // custom parameters addressed to the "perDict" dictionary; asserted below to come back as name -> value
+    params.add(SpellingParams.SPELLCHECK_PREFIX + ".perDict.foo", "bar");
+    params.add(SpellingParams.SPELLCHECK_PREFIX + ".perDict.bar", "foo");
+
+    SolrRequestHandler handler = core.getRequestHandler("spellCheckCompRH");
+    SolrQueryResponse rsp = new SolrQueryResponse();
+    handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
+    NamedList values = rsp.getValues();
+    NamedList spellCheck = (NamedList) values.get("spellcheck");
+    assertNotNull("spellcheck", spellCheck);
+    NamedList suggestions = (NamedList) spellCheck.get("suggestions");
+    assertNotNull("suggestions", suggestions);
+    NamedList suggestion;
+    Collection theSuggestion;
+    suggestion = (NamedList) suggestions.get("foo");
+    assertEquals(1, suggestion.get("numFound"));
+    assertEquals(0, suggestion.get("startOffset"));
+    assertEquals(1, suggestion.get("endOffset"));
+    theSuggestion = (Collection) suggestion.get("suggestion");
+    assertEquals(1, theSuggestion.size());
+    assertEquals("bar", theSuggestion.iterator().next());
+
+    suggestion = (NamedList) suggestions.get("bar");
+    assertEquals(1, suggestion.get("numFound"));
+    assertEquals(2, suggestion.get("startOffset"));
+    assertEquals(3, suggestion.get("endOffset"));
+    theSuggestion = (Collection) suggestion.get("suggestion");
+    assertEquals(1, theSuggestion.size());
+    assertEquals("foo", theSuggestion.iterator().next());
+  }
+
 @Test public void testCollate() throws Exception { SolrCore core = h.getCore();
diff --git a/solr/src/test/org/apache/solr/spelling/FileBasedSpellCheckerTest.java b/solr/src/test/org/apache/solr/spelling/FileBasedSpellCheckerTest.java index 1b8bb6f4de7..4b30691b0ef 100644 --- a/solr/src/test/org/apache/solr/spelling/FileBasedSpellCheckerTest.java +++ b/solr/src/test/org/apache/solr/spelling/FileBasedSpellCheckerTest.java @@ -80,15 +80,16 @@ public class FileBasedSpellCheckerTest extends SolrTestCaseJ4 { IndexReader reader = core.getSearcher().get().getReader(); Collection tokens = queryConverter.convert("fob"); - SpellingResult result = checker.getSuggestions(tokens, reader); + SpellingOptions spellOpts = new SpellingOptions(tokens, reader); + SpellingResult result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); Map suggestions = result.get(tokens.iterator().next()); Map.Entry entry = suggestions.entrySet().iterator().next(); assertTrue(entry.getKey() + " is not equal to " + "foo", entry.getKey().equals("foo") == true); assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO); - tokens = queryConverter.convert("super"); - result = checker.getSuggestions(tokens, reader); + spellOpts.tokens = queryConverter.convert("super"); + result = 
checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); suggestions = result.get(tokens.iterator().next()); assertTrue("suggestions is not null and it should be", suggestions == null); @@ -118,7 +119,9 @@ public class FileBasedSpellCheckerTest extends SolrTestCaseJ4 { IndexReader reader = core.getSearcher().get().getReader(); Collection tokens = queryConverter.convert("Solar"); - SpellingResult result = checker.getSuggestions(tokens, reader); + + SpellingOptions spellOpts = new SpellingOptions(tokens, reader); + SpellingResult result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); //should be lowercased, b/c we are using a lowercasing analyzer Map suggestions = result.get(tokens.iterator().next()); @@ -128,8 +131,8 @@ public class FileBasedSpellCheckerTest extends SolrTestCaseJ4 { assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO); //test something not in the spell checker - tokens = queryConverter.convert("super"); - result = checker.getSuggestions(tokens, reader); + spellOpts.tokens = queryConverter.convert("super"); + result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); suggestions = result.get(tokens.iterator().next()); assertTrue("suggestions is not null and it should be", suggestions == null); @@ -160,7 +163,8 @@ public class FileBasedSpellCheckerTest extends SolrTestCaseJ4 { IndexReader reader = core.getSearcher().get().getReader(); Collection tokens = queryConverter.convert("solar"); - SpellingResult result = checker.getSuggestions(tokens, reader); + SpellingOptions spellOpts = new SpellingOptions(tokens, reader); + SpellingResult result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); //should be lowercased, b/c we are using a lowercasing analyzer Map 
suggestions = result.get(tokens.iterator().next()); @@ -170,10 +174,10 @@ public class FileBasedSpellCheckerTest extends SolrTestCaseJ4 { assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO); - tokens = queryConverter.convert("super"); - result = checker.getSuggestions(tokens, reader); + spellOpts.tokens = queryConverter.convert("super"); + result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); - suggestions = result.get(tokens.iterator().next()); + suggestions = result.get(spellOpts.tokens.iterator().next()); assertTrue("suggestions is not null and it should be", suggestions == null); } } diff --git a/solr/src/test/org/apache/solr/spelling/IndexBasedSpellCheckerTest.java b/solr/src/test/org/apache/solr/spelling/IndexBasedSpellCheckerTest.java index b0fb980bdfa..4a30c537b90 100644 --- a/solr/src/test/org/apache/solr/spelling/IndexBasedSpellCheckerTest.java +++ b/solr/src/test/org/apache/solr/spelling/IndexBasedSpellCheckerTest.java @@ -125,10 +125,11 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 { IndexReader reader = searcher.getReader(); Collection tokens = queryConverter.convert("documemt"); - SpellingResult result = checker.getSuggestions(tokens, reader); + SpellingOptions spellOpts = new SpellingOptions(tokens, reader); + SpellingResult result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); //should be lowercased, b/c we are using a lowercasing analyzer - Map suggestions = result.get(tokens.iterator().next()); + Map suggestions = result.get(spellOpts.tokens.iterator().next()); assertTrue("documemt is null and it shouldn't be", suggestions != null); assertTrue("documemt Size: " + suggestions.size() + " is not: " + 1, suggestions.size() == 1); Map.Entry entry = suggestions.entrySet().iterator().next(); @@ -136,32 +137,33 @@ public class 
IndexBasedSpellCheckerTest extends SolrTestCaseJ4 { assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO); //test something not in the spell checker - tokens = queryConverter.convert("super"); - result = checker.getSuggestions(tokens, reader); + spellOpts.tokens = queryConverter.convert("super"); + result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); - suggestions = result.get(tokens.iterator().next()); + suggestions = result.get(spellOpts.tokens.iterator().next()); assertTrue("suggestions is not null and it should be", suggestions == null); //test something that is spelled correctly - tokens = queryConverter.convert("document"); - result = checker.getSuggestions(tokens, reader); + spellOpts.tokens = queryConverter.convert("document"); + result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); - suggestions = result.get(tokens.iterator().next()); + suggestions = result.get(spellOpts.tokens.iterator().next()); assertTrue("suggestions is null and it shouldn't be", suggestions == null); //Has multiple possibilities, but the exact exists, so that should be returned - tokens = queryConverter.convert("red"); - result = checker.getSuggestions(tokens, reader, 2); - assertTrue("result is null and it shouldn't be", result != null); - suggestions = result.get(tokens.iterator().next()); + spellOpts.tokens = queryConverter.convert("red"); + spellOpts.count = 2; + result = checker.getSuggestions(spellOpts); + assertNotNull(result); + suggestions = result.get(spellOpts.tokens.iterator().next()); assertTrue("suggestions is not null and it should be", suggestions == null); //Try out something which should have multiple suggestions - tokens = queryConverter.convert("bug"); - result = checker.getSuggestions(tokens, reader, 2); - assertTrue("result is null and it shouldn't be", result != 
null); - suggestions = result.get(tokens.iterator().next()); - assertTrue("suggestions is null and it shouldn't be", suggestions != null); + spellOpts.tokens = queryConverter.convert("bug"); + result = checker.getSuggestions(spellOpts); + assertNotNull(result); + suggestions = result.get(spellOpts.tokens.iterator().next()); + assertNotNull(suggestions); assertTrue("suggestions Size: " + suggestions.size() + " is not: " + 2, suggestions.size() == 2); entry = suggestions.entrySet().iterator().next(); @@ -198,10 +200,11 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 { IndexReader reader = searcher.getReader(); Collection tokens = queryConverter.convert("documemt"); - SpellingResult result = checker.getSuggestions(tokens, reader, 1, false, true); + SpellingOptions spellOpts = new SpellingOptions(tokens, reader, 1, false, true, 0.5f, null); + SpellingResult result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); //should be lowercased, b/c we are using a lowercasing analyzer - Map suggestions = result.get(tokens.iterator().next()); + Map suggestions = result.get(spellOpts.tokens.iterator().next()); assertTrue("documemt is null and it shouldn't be", suggestions != null); assertTrue("documemt Size: " + suggestions.size() + " is not: " + 1, suggestions.size() == 1); Map.Entry entry = suggestions.entrySet().iterator().next(); @@ -209,16 +212,16 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 { assertTrue(entry.getValue() + " does not equal: " + 2, entry.getValue() == 2); //test something not in the spell checker - tokens = queryConverter.convert("super"); - result = checker.getSuggestions(tokens, reader, 1, false, true); + spellOpts.tokens = queryConverter.convert("super"); + result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); - suggestions = result.get(tokens.iterator().next()); + suggestions = 
result.get(spellOpts.tokens.iterator().next()); assertTrue("suggestions is not null and it should be", suggestions == null); - tokens = queryConverter.convert("document"); - result = checker.getSuggestions(tokens, reader, 1, false, true); + spellOpts.tokens = queryConverter.convert("document"); + result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); - suggestions = result.get(tokens.iterator().next()); + suggestions = result.get(spellOpts.tokens.iterator().next()); assertTrue("suggestions is not null and it should be", suggestions == null); } finally { holder.decref(); @@ -304,10 +307,11 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 { IndexReader reader = searcher.getReader(); Collection tokens = queryConverter.convert("flesh"); - SpellingResult result = checker.getSuggestions(tokens, reader, 1, false, true); + SpellingOptions spellOpts = new SpellingOptions(tokens, reader, 1, false, true, 0.5f, null); + SpellingResult result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); //should be lowercased, b/c we are using a lowercasing analyzer - Map suggestions = result.get(tokens.iterator().next()); + Map suggestions = result.get(spellOpts.tokens.iterator().next()); assertTrue("flesh is null and it shouldn't be", suggestions != null); assertTrue("flesh Size: " + suggestions.size() + " is not: " + 1, suggestions.size() == 1); Map.Entry entry = suggestions.entrySet().iterator().next(); @@ -315,16 +319,16 @@ public class IndexBasedSpellCheckerTest extends SolrTestCaseJ4 { assertTrue(entry.getValue() + " does not equal: " + 1, entry.getValue() == 1); //test something not in the spell checker - tokens = queryConverter.convert("super"); - result = checker.getSuggestions(tokens, reader, 1, false, true); + spellOpts.tokens = queryConverter.convert("super"); + result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", 
result != null); - suggestions = result.get(tokens.iterator().next()); + suggestions = result.get(spellOpts.tokens.iterator().next()); assertTrue("suggestions is not null and it should be", suggestions == null); - tokens = queryConverter.convert("Caroline"); - result = checker.getSuggestions(tokens, reader, 1, false, true); + spellOpts.tokens = queryConverter.convert("Caroline"); + result = checker.getSuggestions(spellOpts); assertTrue("result is null and it shouldn't be", result != null); - suggestions = result.get(tokens.iterator().next()); + suggestions = result.get(spellOpts.tokens.iterator().next()); assertTrue("suggestions is not null and it should be", suggestions == null); } finally { holder.decref(); diff --git a/solr/src/test/test-files/solr/conf/solrconfig.xml b/solr/src/test/test-files/solr/conf/solrconfig.xml index e6231568231..d42edff8f9a 100644 --- a/solr/src/test/test-files/solr/conf/solrconfig.xml +++ b/solr/src/test/test-files/solr/conf/solrconfig.xml @@ -377,7 +377,11 @@ org.apache.solr.spelling.SampleComparator true - + + perDict + org.apache.solr.handler.component.DummyCustomParamSpellChecker + lowerfilt +