SOLR-14131: Add maxQueryLength option to DirectSolrSpellchecker.

Closes #1113
This commit is contained in:
Bruno Roustant 2019-12-24 11:46:00 +01:00
parent 72c99e921c
commit 27840562a6
No known key found for this signature in database
GPG Key ID: CD28DABB95360525
4 changed files with 65 additions and 7 deletions

View File

@ -170,6 +170,8 @@ Improvements
SOLR_IP_WHITELIST and SOLR_IP_BLACKLIST. These variables can restrict access to SOLR_IP_WHITELIST and SOLR_IP_BLACKLIST. These variables can restrict access to
Solr based on IP addresses/networks. (rmuir) Solr based on IP addresses/networks. (rmuir)
* SOLR-14131: Add maxQueryLength option to DirectSolrSpellChecker. (Andy Webb via Bruno Roustant)
Optimizations Optimizations
--------------------- ---------------------
(No changes) (No changes)

View File

@ -54,6 +54,7 @@ import org.slf4j.LoggerFactory;
* can be specified as "freq". * can be specified as "freq".
* <li>thresholdTokenFrequency: sets {@link DirectSpellChecker#setThresholdFrequency(float)}. * <li>thresholdTokenFrequency: sets {@link DirectSpellChecker#setThresholdFrequency(float)}.
* <li>minQueryLength: sets {@link DirectSpellChecker#setMinQueryLength(int)}. * <li>minQueryLength: sets {@link DirectSpellChecker#setMinQueryLength(int)}.
* <li>maxQueryLength: sets {@link DirectSpellChecker#setMaxQueryLength(int)}.
* <li>maxQueryFrequency: sets {@link DirectSpellChecker#setMaxQueryFrequency(float)}. * <li>maxQueryFrequency: sets {@link DirectSpellChecker#setMaxQueryFrequency(float)}.
* </ul> * </ul>
* @see DirectSpellChecker * @see DirectSpellChecker
@ -86,6 +87,9 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
public static final String MINQUERYLENGTH = "minQueryLength"; public static final String MINQUERYLENGTH = "minQueryLength";
public static final int DEFAULT_MINQUERYLENGTH = 4; public static final int DEFAULT_MINQUERYLENGTH = 4;
public static final String MAXQUERYLENGTH = "maxQueryLength";
public static final int DEFAULT_MAXQUERYLENGTH = Integer.MAX_VALUE;
public static final String MAXQUERYFREQUENCY = "maxQueryFrequency"; public static final String MAXQUERYFREQUENCY = "maxQueryFrequency";
public static final float DEFAULT_MAXQUERYFREQUENCY = 0.01f; public static final float DEFAULT_MAXQUERYFREQUENCY = 0.01f;
@ -144,6 +148,11 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
Integer queryLength = params.getInt(MINQUERYLENGTH); Integer queryLength = params.getInt(MINQUERYLENGTH);
if (queryLength != null) if (queryLength != null)
minQueryLength = queryLength; minQueryLength = queryLength;
int maxQueryLength = DEFAULT_MAXQUERYLENGTH;
Integer overriddenMaxQueryLength = params.getInt(MAXQUERYLENGTH);
if (overriddenMaxQueryLength != null)
maxQueryLength = overriddenMaxQueryLength;
float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY; float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY;
Float queryFreq = params.getFloat(MAXQUERYFREQUENCY); Float queryFreq = params.getFloat(MAXQUERYFREQUENCY);
@ -158,6 +167,7 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
checker.setThresholdFrequency(minThreshold); checker.setThresholdFrequency(minThreshold);
checker.setMaxInspections(maxInspections); checker.setMaxInspections(maxInspections);
checker.setMinQueryLength(minQueryLength); checker.setMinQueryLength(minQueryLength);
checker.setMaxQueryLength(maxQueryLength);
checker.setMaxQueryFrequency(maxQueryFrequency); checker.setMaxQueryFrequency(maxQueryFrequency);
checker.setLowerCaseTerms(false); checker.setLowerCaseTerms(false);

View File

@ -62,20 +62,25 @@ public class DirectSolrSpellCheckerTest extends SolrTestCaseJ4 {
checker.init(spellchecker, core); checker.init(spellchecker, core);
h.getCore().withSearcher(searcher -> { h.getCore().withSearcher(searcher -> {
// check that 'fob' is corrected to 'foo'
Collection<Token> tokens = queryConverter.convert("fob"); Collection<Token> tokens = queryConverter.convert("fob");
SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.getIndexReader()); SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.getIndexReader());
SpellingResult result = checker.getSuggestions(spellOpts); SpellingResult result = checker.getSuggestions(spellOpts);
assertTrue("result is null and it shouldn't be", result != null); assertNotNull("result shouldn't be null", result);
Map<String, Integer> suggestions = result.get(tokens.iterator().next()); Map<String, Integer> suggestions = result.get(tokens.iterator().next());
assertFalse("suggestions shouldn't be empty", suggestions.isEmpty());
Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next(); Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
assertTrue(entry.getKey() + " is not equal to " + "foo", entry.getKey().equals("foo") == true); assertEquals("foo", entry.getKey());
assertFalse(entry.getValue() + " equals: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO); assertFalse(entry.getValue() + " equals: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);
// check that 'super' is *not* corrected
spellOpts.tokens = queryConverter.convert("super"); spellOpts.tokens = queryConverter.convert("super");
result = checker.getSuggestions(spellOpts); result = checker.getSuggestions(spellOpts);
assertTrue("result is null and it shouldn't be", result != null); assertNotNull("result shouldn't be null", result);
suggestions = result.get(tokens.iterator().next()); suggestions = result.get(spellOpts.tokens.iterator().next());
assertTrue("suggestions is not null and it should be", suggestions == null); assertNotNull("suggestions shouldn't be null", suggestions);
assertTrue("suggestions should be empty", suggestions.isEmpty());
return null; return null;
}); });
} }
@ -88,6 +93,46 @@ public class DirectSolrSpellCheckerTest extends SolrTestCaseJ4 {
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='fox']/arr[@name='suggestion']/lst/int[@name='freq']=2", "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='fox']/arr[@name='suggestion']/lst/int[@name='freq']=2",
"//lst[@name='spellcheck']/bool[@name='correctlySpelled']='true'" "//lst[@name='spellcheck']/bool[@name='correctlySpelled']='true'"
); );
} }
/**
 * Exercises the {@code maxQueryLength} option in both configurations: first with the
 * limit applied, then without it.
 */
@Test
public void testMaxQueryLength() throws Exception {
  // Same order as before: limited first, unlimited second.
  for (boolean limitQueryLength : new boolean[] {true, false}) {
    testMaxQueryLength(limitQueryLength);
  }
}
/**
 * Runs a spellcheck for the misspelled term "anothar" against the "teststop" field and
 * verifies the outcome depending on whether {@code maxQueryLength} is configured.
 *
 * @param limitQueryLength when {@code true}, {@code maxQueryLength} is set to 4 and the
 *     suggestions for the term are expected to be empty; when {@code false}, the option
 *     is left unset and the first suggestion is expected to be "another"
 * @throws Exception if the searcher callback fails
 */
private void testMaxQueryLength(boolean limitQueryLength) throws Exception {
  DirectSolrSpellChecker checker = new DirectSolrSpellChecker();
  NamedList<Object> spellchecker = new NamedList<>();
  spellchecker.add("classname", DirectSolrSpellChecker.class.getName());
  spellchecker.add(SolrSpellChecker.FIELD, "teststop");
  spellchecker.add(DirectSolrSpellChecker.MINQUERYLENGTH, 2);

  // demonstrate that "anothar" is not corrected when maxQueryLength is set to a small number
  if (limitQueryLength) {
    spellchecker.add(DirectSolrSpellChecker.MAXQUERYLENGTH, 4);
  }

  SolrCore core = h.getCore();
  checker.init(spellchecker, core);

  h.getCore().withSearcher(searcher -> {
    Collection<Token> tokens = queryConverter.convert("anothar");
    SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.getIndexReader());
    SpellingResult result = checker.getSuggestions(spellOpts);
    assertNotNull("result shouldn't be null", result);

    // Suggestions are looked up by the first token of the converted query.
    Token queryToken = tokens.iterator().next();
    Map<String, Integer> suggestions = result.get(queryToken);
    assertNotNull("suggestions shouldn't be null", suggestions);

    if (limitQueryLength) {
      assertTrue("suggestions should be empty", suggestions.isEmpty());
    } else {
      assertFalse("suggestions shouldn't be empty", suggestions.isEmpty());
      Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
      assertEquals("another", entry.getKey());
    }
    return null;
  });
}
} }

View File

@ -69,6 +69,7 @@ The `DirectSolrSpellChecker` uses terms from the Solr index without building a p
<int name="minPrefix">1</int> <int name="minPrefix">1</int>
<int name="maxInspections">5</int> <int name="maxInspections">5</int>
<int name="minQueryLength">4</int> <int name="minQueryLength">4</int>
<int name="maxQueryLength">40</int>
<float name="maxQueryFrequency">0.01</float> <float name="maxQueryFrequency">0.01</float>
<float name="thresholdTokenFrequency">.01</float> <float name="thresholdTokenFrequency">.01</float>
</lst> </lst>
@ -81,7 +82,7 @@ Many of the parameters relate to how this spell checker should query the index f
Because this spell checker is querying the main index, you may want to limit how often it queries the index to be sure to avoid any performance conflicts with user queries. The `accuracy` setting defines the threshold for a valid suggestion, while `maxEdits` defines the number of changes to the term to allow. Since most spelling mistakes are only 1 letter off, setting this to 1 will reduce the number of possible suggestions (the default, however, is 2); the value can only be 1 or 2. `minPrefix` defines the minimum number of characters the terms should share. Setting this to 1 means that the spelling suggestions will all start with the same letter, for example. Because this spell checker is querying the main index, you may want to limit how often it queries the index to be sure to avoid any performance conflicts with user queries. The `accuracy` setting defines the threshold for a valid suggestion, while `maxEdits` defines the number of changes to the term to allow. Since most spelling mistakes are only 1 letter off, setting this to 1 will reduce the number of possible suggestions (the default, however, is 2); the value can only be 1 or 2. `minPrefix` defines the minimum number of characters the terms should share. Setting this to 1 means that the spelling suggestions will all start with the same letter, for example.
The `maxInspections` parameter defines the maximum number of possible matches to review before returning results; the default is 5. `minQueryLength` defines how many characters must be in the query before suggestions are provided; the default is 4. The `maxInspections` parameter defines the maximum number of possible matches to review before returning results; the default is 5. `minQueryLength` defines how many characters must be in the query before suggestions are provided; the default is 4. `maxQueryLength` enables the spell checker to skip over very long query terms, which can avoid expensive operations or exceptions. There is no limit to term length by default.
At first, the spellchecker analyses incoming query words by looking them up in the index. Only query words which are absent from the index or too rare (below `maxQueryFrequency`) are considered misspelled and used for finding suggestions. Words which are more frequent than `maxQueryFrequency` bypass the spellchecker unchanged. After suggestions for every misspelled word are found, they are filtered for sufficient frequency with `thresholdTokenFrequency` as the boundary value. These parameters (`maxQueryFrequency` and `thresholdTokenFrequency`) can be a percentage (such as .01, or 1%) or an absolute value (such as 4). At first, the spellchecker analyses incoming query words by looking them up in the index. Only query words which are absent from the index or too rare (below `maxQueryFrequency`) are considered misspelled and used for finding suggestions. Words which are more frequent than `maxQueryFrequency` bypass the spellchecker unchanged. After suggestions for every misspelled word are found, they are filtered for sufficient frequency with `thresholdTokenFrequency` as the boundary value. These parameters (`maxQueryFrequency` and `thresholdTokenFrequency`) can be a percentage (such as .01, or 1%) or an absolute value (such as 4).