diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 9d17c5245f2..2e1679d6d0a 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -170,6 +170,8 @@ Improvements SOLR_IP_WHITELIST and SOLR_IP_BLACKLIST. These variables can restrict access to Solr based on IP addresses/networks. (rmuir) +* SOLR-14131: Add maxQueryLength option to DirectSolrSpellChecker. (Andy Webb via Bruno Roustant) + Optimizations --------------------- (No changes) diff --git a/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java b/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java index a29d80d1be2..527a3da4745 100644 --- a/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java +++ b/solr/core/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java @@ -54,6 +54,7 @@ import org.slf4j.LoggerFactory; * can be specified as "freq". *
  • thresholdTokenFrequency: sets {@link DirectSpellChecker#setThresholdFrequency(float)}. *
  • minQueryLength: sets {@link DirectSpellChecker#setMinQueryLength(int)}. + *
  • maxQueryLength: sets {@link DirectSpellChecker#setMaxQueryLength(int)}. *
  • maxQueryFrequency: sets {@link DirectSpellChecker#setMaxQueryFrequency(float)}. * * @see DirectSpellChecker @@ -86,6 +87,9 @@ public class DirectSolrSpellChecker extends SolrSpellChecker { public static final String MINQUERYLENGTH = "minQueryLength"; public static final int DEFAULT_MINQUERYLENGTH = 4; + public static final String MAXQUERYLENGTH = "maxQueryLength"; + public static final int DEFAULT_MAXQUERYLENGTH = Integer.MAX_VALUE; + public static final String MAXQUERYFREQUENCY = "maxQueryFrequency"; public static final float DEFAULT_MAXQUERYFREQUENCY = 0.01f; @@ -144,6 +148,11 @@ public class DirectSolrSpellChecker extends SolrSpellChecker { Integer queryLength = params.getInt(MINQUERYLENGTH); if (queryLength != null) minQueryLength = queryLength; + + int maxQueryLength = DEFAULT_MAXQUERYLENGTH; + Integer overriddenMaxQueryLength = params.getInt(MAXQUERYLENGTH); + if (overriddenMaxQueryLength != null) + maxQueryLength = overriddenMaxQueryLength; float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY; Float queryFreq = params.getFloat(MAXQUERYFREQUENCY); @@ -158,6 +167,7 @@ public class DirectSolrSpellChecker extends SolrSpellChecker { checker.setThresholdFrequency(minThreshold); checker.setMaxInspections(maxInspections); checker.setMinQueryLength(minQueryLength); + checker.setMaxQueryLength(maxQueryLength); checker.setMaxQueryFrequency(maxQueryFrequency); checker.setLowerCaseTerms(false); diff --git a/solr/core/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java b/solr/core/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java index 0e4cc9ade6f..6106fb4bb78 100644 --- a/solr/core/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java +++ b/solr/core/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java @@ -62,20 +62,25 @@ public class DirectSolrSpellCheckerTest extends SolrTestCaseJ4 { checker.init(spellchecker, core); h.getCore().withSearcher(searcher -> { + + // check that 'fob' is corrected to 'foo' Collection 
tokens = queryConverter.convert("fob"); SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.getIndexReader()); SpellingResult result = checker.getSuggestions(spellOpts); - assertTrue("result is null and it shouldn't be", result != null); + assertNotNull("result shouldn't be null", result); Map suggestions = result.get(tokens.iterator().next()); + assertFalse("suggestions shouldn't be empty", suggestions.isEmpty()); Map.Entry entry = suggestions.entrySet().iterator().next(); - assertTrue(entry.getKey() + " is not equal to " + "foo", entry.getKey().equals("foo") == true); + assertEquals("foo", entry.getKey()); assertFalse(entry.getValue() + " equals: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO); + // check that 'super' is *not* corrected spellOpts.tokens = queryConverter.convert("super"); result = checker.getSuggestions(spellOpts); - assertTrue("result is null and it shouldn't be", result != null); - suggestions = result.get(tokens.iterator().next()); - assertTrue("suggestions is not null and it should be", suggestions == null); + assertNotNull("result shouldn't be null", result); + suggestions = result.get(spellOpts.tokens.iterator().next()); + assertNotNull("suggestions shouldn't be null", suggestions); + assertTrue("suggestions should be empty", suggestions.isEmpty()); return null; }); } @@ -88,6 +93,46 @@ public class DirectSolrSpellCheckerTest extends SolrTestCaseJ4 { "//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='fox']/arr[@name='suggestion']/lst/int[@name='freq']=2", "//lst[@name='spellcheck']/bool[@name='correctlySpelled']='true'" ); - } + } + + @Test + public void testMaxQueryLength() throws Exception { + testMaxQueryLength(true); + testMaxQueryLength(false); + } + + private void testMaxQueryLength(Boolean limitQueryLength) throws Exception { + + DirectSolrSpellChecker checker = new DirectSolrSpellChecker(); + NamedList spellchecker = new NamedList<>(); + 
spellchecker.add("classname", DirectSolrSpellChecker.class.getName()); + spellchecker.add(SolrSpellChecker.FIELD, "teststop"); + spellchecker.add(DirectSolrSpellChecker.MINQUERYLENGTH, 2); + + // demonstrate that "anothar" is not corrected when maxQueryLength is set to a small number + if (limitQueryLength) spellchecker.add(DirectSolrSpellChecker.MAXQUERYLENGTH, 4); + + SolrCore core = h.getCore(); + checker.init(spellchecker, core); + + h.getCore().withSearcher(searcher -> { + Collection tokens = queryConverter.convert("anothar"); + SpellingOptions spellOpts = new SpellingOptions(tokens, searcher.getIndexReader()); + SpellingResult result = checker.getSuggestions(spellOpts); + assertNotNull("result shouldn't be null", result); + Map suggestions = result.get(tokens.iterator().next()); + assertNotNull("suggestions shouldn't be null", suggestions); + + if (limitQueryLength) { + assertTrue("suggestions should be empty", suggestions.isEmpty()); + } else { + assertFalse("suggestions shouldn't be empty", suggestions.isEmpty()); + Map.Entry entry = suggestions.entrySet().iterator().next(); + assertEquals("another", entry.getKey()); + } + + return null; + }); + } } diff --git a/solr/solr-ref-guide/src/spell-checking.adoc b/solr/solr-ref-guide/src/spell-checking.adoc index c883ed93968..c480b348526 100644 --- a/solr/solr-ref-guide/src/spell-checking.adoc +++ b/solr/solr-ref-guide/src/spell-checking.adoc @@ -69,6 +69,7 @@ The `DirectSolrSpellChecker` uses terms from the Solr index without building a p 1 5 4 + 40 0.01 .01 @@ -81,7 +82,7 @@ Many of the parameters relate to how this spell checker should query the index f Because this spell checker is querying the main index, you may want to limit how often it queries the index to be sure to avoid any performance conflicts with user queries. The `accuracy` setting defines the threshold for a valid suggestion, while `maxEdits` defines the number of changes to the term to allow. 
Since most spelling mistakes are only 1 letter off, setting this to 1 will reduce the number of possible suggestions (the default, however, is 2); the value can only be 1 or 2. `minPrefix` defines the minimum number of characters the terms should share. Setting this to 1 means that the spelling suggestions will all start with the same letter, for example. -The `maxInspections` parameter defines the maximum number of possible matches to review before returning results; the default is 5. `minQueryLength` defines how many characters must be in the query before suggestions are provided; the default is 4. +The `maxInspections` parameter defines the maximum number of possible matches to review before returning results; the default is 5. `minQueryLength` defines how many characters must be in the query before suggestions are provided; the default is 4. `maxQueryLength` enables the spell checker to skip over very long query terms, which can avoid expensive operations or exceptions. There is no limit to term length by default. First, the spellchecker analyzes incoming query words by looking them up in the index. Only query words that are absent from the index or too rare (below `maxQueryFrequency`) are considered misspelled and used for finding suggestions. Words that are more frequent than `maxQueryFrequency` bypass the spellchecker unchanged. After suggestions for every misspelled word are found, they are filtered by frequency, with `thresholdTokenFrequency` as the boundary value. These parameters (`maxQueryFrequency` and `thresholdTokenFrequency`) can be a percentage (such as .01, or 1%) or an absolute value (such as 4).