From bdee0a976497332dd922510c0ac459ba0dbb0af4 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 6 Jun 2011 19:14:48 +0000 Subject: [PATCH] SOLR-2462: use of spellcheck.collate could result in extremely high memory usage git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1132729 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 5 ++ .../solr/common/params/SpellingParams.java | 12 ++++- .../component/SpellCheckComponent.java | 3 +- .../solr/spelling/PossibilityIterator.java | 33 +++++++++---- .../solr/spelling/RankedSpellPossibility.java | 16 +++++- .../solr/spelling/SpellCheckCollator.java | 4 +- .../response/TestSpellCheckResponse.java | 2 +- .../solr/spelling/SpellCheckCollatorTest.java | 12 +++-- .../SpellPossibilityIteratorTest.java | 49 ++++++++++++++++--- 9 files changed, 109 insertions(+), 27 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index a30f7e700ec..97a6b5a1183 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -277,6 +277,11 @@ Bug Fixes English-specific fieldTypes (Jan Høydahl, hossman, Robert Muir, yonik, Mike McCandless) +* SOLR-2462: Fix extremely high memory usage problems with spellcheck.collate. + Separately, an additional spellcheck.maxCollationEvaluations (default=10000) + parameter is added to avoid excessive CPU time in extreme cases (e.g. long + queries with many misspelled words). (James Dyer via rmuir) + ================== 3.2.0 ================== Versions of Major Components --------------------- diff --git a/solr/src/common/org/apache/solr/common/params/SpellingParams.java b/solr/src/common/org/apache/solr/common/params/SpellingParams.java index 71d2aa5f348..50a10a2a577 100644 --- a/solr/src/common/org/apache/solr/common/params/SpellingParams.java +++ b/solr/src/common/org/apache/solr/common/params/SpellingParams.java @@ -95,7 +95,15 @@ public interface SpellingParams { * Default=0. Ignored of "spellcheck.collate" is false. *

*/ - public static final String SPELLCHECK_MAX_COLLATION_TRIES = SPELLCHECK_PREFIX + "maxCollationTries"; + public static final String SPELLCHECK_MAX_COLLATION_TRIES = SPELLCHECK_PREFIX + "maxCollationTries"; + /** + *

+ * The maximum number of word correction combinations to rank and evaluate prior to deciding which collation + * candidates to test against the index. This is a performance safety-net in cases a user enters a query with + * many misspelled words. The default is 10,000 combinations. + *

+ */ + public static final String SPELLCHECK_MAX_COLLATION_EVALUATIONS = SPELLCHECK_PREFIX + "maxCollationEvaluations"; /** *

@@ -105,7 +113,7 @@ public interface SpellingParams { *

*/ public static final String SPELLCHECK_COLLATE_EXTENDED_RESULTS = SPELLCHECK_PREFIX + "collateExtendedResults"; - + /** * Certain spelling implementations may allow for an accuracy setting. */ diff --git a/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java b/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java index 897eb87af9b..559bbd81f83 100644 --- a/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java +++ b/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java @@ -172,11 +172,12 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar NamedList response) { int maxCollations = params.getInt(SPELLCHECK_MAX_COLLATIONS, 1); int maxCollationTries = params.getInt(SPELLCHECK_MAX_COLLATION_TRIES, 0); + int maxCollationEvaluations = params.getInt(SPELLCHECK_MAX_COLLATION_EVALUATIONS, 10000); boolean collationExtendedResults = params.getBool(SPELLCHECK_COLLATE_EXTENDED_RESULTS, false); boolean shard = params.getBool(ShardParams.IS_SHARD, false); SpellCheckCollator collator = new SpellCheckCollator(); - List collations = collator.collate(spellingResult, q, rb, maxCollations, maxCollationTries); + List collations = collator.collate(spellingResult, q, rb, maxCollations, maxCollationTries, maxCollationEvaluations); //by sorting here we guarantee a non-distributed request returns all //results in the same order as a distributed request would, //even in cases when the internal rank is the same. diff --git a/solr/src/java/org/apache/solr/spelling/PossibilityIterator.java b/solr/src/java/org/apache/solr/spelling/PossibilityIterator.java index ec3aaa7db94..84e41e27ede 100644 --- a/solr/src/java/org/apache/solr/spelling/PossibilityIterator.java +++ b/solr/src/java/org/apache/solr/spelling/PossibilityIterator.java @@ -17,12 +17,13 @@ package org.apache.solr.spelling; */ import java.util.ArrayList; -import java.util.Collections; +import java.util.Arrays; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.NoSuchElementException; +import java.util.PriorityQueue; import org.apache.lucene.analysis.Token; @@ -38,8 +39,7 @@ import org.apache.lucene.analysis.Token; */ public class PossibilityIterator implements Iterator { private List> possibilityList = new ArrayList>(); - private List rankedPossibilityList = new ArrayList(); - private Iterator rankedPossibilityIterator; + private Iterator rankedPossibilityIterator = null; private int correctionIndex[]; private boolean done = false; @@ -56,7 +56,7 @@ public class PossibilityIterator implements Iterator { * * @param suggestions */ - public PossibilityIterator(Map> suggestions) { + public PossibilityIterator(Map> suggestions, int maximumRequiredSuggestions, int maxEvaluations) { for (Map.Entry> entry : suggestions.entrySet()) { Token token = entry.getKey(); List possibleCorrections = new ArrayList(); @@ -84,12 +84,27 @@ public class PossibilityIterator implements Iterator { correctionIndex[i] = 0; } } - - while (internalHasNext()) { - rankedPossibilityList.add(internalNext()); + + long count = 0; + PriorityQueue rankedPossibilities = new PriorityQueue(); + while (count < maxEvaluations && internalHasNext()) { + RankedSpellPossibility rsp = internalNext(); + count++; + + if(rankedPossibilities.size() >= maximumRequiredSuggestions && rsp.getRank() >= rankedPossibilities.peek().getRank()) { + continue; + } + rankedPossibilities.offer(rsp); + if(rankedPossibilities.size() > maximumRequiredSuggestions) { + rankedPossibilities.poll(); + } } - Collections.sort(rankedPossibilityList); - rankedPossibilityIterator = rankedPossibilityList.iterator(); + + RankedSpellPossibility[] rpArr = new RankedSpellPossibility[rankedPossibilities.size()]; + for(int i=rankedPossibilities.size() - 1 ; i>=0 ; i--) { + rpArr[i] = rankedPossibilities.remove(); + } + rankedPossibilityIterator = Arrays.asList(rpArr).iterator(); } private boolean internalHasNext() { diff --git a/solr/src/java/org/apache/solr/spelling/RankedSpellPossibility.java b/solr/src/java/org/apache/solr/spelling/RankedSpellPossibility.java index 02103f9c3f5..61093fc79a9 100644 --- a/solr/src/java/org/apache/solr/spelling/RankedSpellPossibility.java +++ b/solr/src/java/org/apache/solr/spelling/RankedSpellPossibility.java @@ -22,8 +22,9 @@ public class RankedSpellPossibility implements Comparable corrections; private int rank; + //Rank poorer suggestions ahead of better ones for use with a PriorityQueue public int compareTo(RankedSpellPossibility rcl) { - return new Integer(rank).compareTo(rcl.rank); + return new Integer(rcl.rank).compareTo(rank); } public List getCorrections() { @@ -41,4 +42,17 @@ public class RankedSpellPossibility implements Comparable").append(corr.getCorrection()).append(" (").append(corr.getNumberOfOccurences()).append(")"); + } + } + return sb.toString(); + } } diff --git a/solr/src/java/org/apache/solr/spelling/SpellCheckCollator.java b/solr/src/java/org/apache/solr/spelling/SpellCheckCollator.java index 6b5c37b0ef1..5ed1ab3e4c0 100644 --- a/solr/src/java/org/apache/solr/spelling/SpellCheckCollator.java +++ b/solr/src/java/org/apache/solr/spelling/SpellCheckCollator.java @@ -36,7 +36,7 @@ public class SpellCheckCollator { private static final Logger LOG = LoggerFactory.getLogger(SpellCheckCollator.class); public List collate(SpellingResult result, String originalQuery, ResponseBuilder ultimateResponse, - int maxCollations, int maxTries) { + int maxCollations, int maxTries, int maxEvaluations) { List collations = new ArrayList(); QueryComponent queryComponent = null; @@ -62,7 +62,7 @@ public class SpellCheckCollator { int tryNo = 0; int collNo = 0; - PossibilityIterator possibilityIter = new PossibilityIterator(result.getSuggestions()); + PossibilityIterator possibilityIter = new PossibilityIterator(result.getSuggestions(), maxTries, maxEvaluations); while (tryNo < maxTries && collNo < maxCollations && possibilityIter.hasNext()) { RankedSpellPossibility possibility = possibilityIter.next(); diff --git a/solr/src/test/org/apache/solr/client/solrj/response/TestSpellCheckResponse.java b/solr/src/test/org/apache/solr/client/solrj/response/TestSpellCheckResponse.java index 646d8bca727..7b11bf0f4ac 100644 --- a/solr/src/test/org/apache/solr/client/solrj/response/TestSpellCheckResponse.java +++ b/solr/src/test/org/apache/solr/client/solrj/response/TestSpellCheckResponse.java @@ -143,7 +143,7 @@ public class TestSpellCheckResponse extends SolrJettyTestBase { //Test Expanded Collation Results query.set(SpellingParams.SPELLCHECK_COLLATE_EXTENDED_RESULTS, true); - query.set(SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, 5); + query.set(SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, 10); query.set(SpellingParams.SPELLCHECK_MAX_COLLATIONS, 2); request = new QueryRequest(query); response = request.process(server).getSpellCheckResponse(); diff --git a/solr/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java b/solr/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java index 669676f6341..c819c9f8a8a 100644 --- a/solr/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java +++ b/solr/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java @@ -60,8 +60,8 @@ public class SpellCheckCollatorTest extends SolrTestCaseJ4 { params.add(SpellCheckComponent.SPELLCHECK_BUILD, "true"); params.add(SpellCheckComponent.SPELLCHECK_COUNT, "10"); params.add(SpellCheckComponent.SPELLCHECK_COLLATE, "true"); - params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "5"); - params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "2"); + params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10"); + params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10"); params.add(CommonParams.Q, "lowerfilt:(+fauth +home +loane)"); params.add(CommonParams.FQ, "NOT(id:1)"); @@ -77,8 +77,10 @@ public class SpellCheckCollatorTest extends SolrTestCaseJ4 { NamedList spellCheck = (NamedList) values.get("spellcheck"); NamedList suggestions = (NamedList) spellCheck.get("suggestions"); List collations = suggestions.getAll("collation"); - assertTrue(collations.size() == 1); - assertTrue(collations.get(0).equals("lowerfilt:(+faith +hope +love)")); + assertTrue(collations.size() > 0); + for(String collation : collations) { + assertTrue(!collation.equals("lowerfilt:(+faith +hope +loaves)")); + } } @Test @@ -180,7 +182,7 @@ public class SpellCheckCollatorTest extends SolrTestCaseJ4 { // combination exists. params.remove(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES); params.remove(SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS); - params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "5"); + params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10"); params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "2"); handler = core.getRequestHandler("spellCheckCompRH"); rsp = new SolrQueryResponse(); diff --git a/solr/src/test/org/apache/solr/spelling/SpellPossibilityIteratorTest.java b/solr/src/test/org/apache/solr/spelling/SpellPossibilityIteratorTest.java index b70ba1fb95a..6cf12d76e06 100644 --- a/solr/src/test/org/apache/solr/spelling/SpellPossibilityIteratorTest.java +++ b/solr/src/test/org/apache/solr/spelling/SpellPossibilityIteratorTest.java @@ -28,6 +28,7 @@ import org.junit.Test; public class SpellPossibilityIteratorTest extends SolrTestCaseJ4 { private static Map> suggestions = new LinkedHashMap>(); + private static Map> lotsaSuggestions = new LinkedHashMap>(); @Override @Before @@ -72,21 +73,57 @@ public class SpellPossibilityIteratorTest extends SolrTestCaseJ4 { suggestions.put(new Token("AYE", 0, 2), AYE); suggestions.put(new Token("BEE", 0, 2), BEE); suggestions.put(new Token("CEE", 0, 2), CEE); + + lotsaSuggestions.put(new Token("AYE", 0, 2), AYE); + lotsaSuggestions.put(new Token("BEE", 0, 2), BEE); + lotsaSuggestions.put(new Token("CEE", 0, 2), CEE); + + lotsaSuggestions.put(new Token("AYE1", 0, 3), AYE); + lotsaSuggestions.put(new Token("BEE1", 0, 3), BEE); + lotsaSuggestions.put(new Token("CEE1", 0, 3), CEE); + + lotsaSuggestions.put(new Token("AYE2", 0, 3), AYE); + lotsaSuggestions.put(new Token("BEE2", 0, 3), BEE); + lotsaSuggestions.put(new Token("CEE2", 0, 3), CEE); + + lotsaSuggestions.put(new Token("AYE3", 0, 3), AYE); + lotsaSuggestions.put(new Token("BEE3", 0, 3), BEE); + lotsaSuggestions.put(new Token("CEE3", 0, 3), CEE); + + lotsaSuggestions.put(new Token("AYE4", 0, 3), AYE); + lotsaSuggestions.put(new Token("BEE4", 0, 3), BEE); + lotsaSuggestions.put(new Token("CEE4", 0, 3), CEE); + } + + @Test + public void testScalability() throws Exception { + PossibilityIterator iter = new PossibilityIterator(lotsaSuggestions, 1000, 10000); + int count = 0; + while (iter.hasNext()) { + RankedSpellPossibility rsp = iter.next(); + count++; + } + assertTrue(count==1000); } @Test public void testSpellPossibilityIterator() throws Exception { - PossibilityIterator iter = new PossibilityIterator(suggestions); + PossibilityIterator iter = new PossibilityIterator(suggestions, 1000, 10000); int count = 0; while (iter.hasNext()) { - iter.next(); + RankedSpellPossibility rsp = iter.next(); + if(count==0) { + assertTrue("I".equals(rsp.getCorrections().get(0).getCorrection())); + assertTrue("alpha".equals(rsp.getCorrections().get(1).getCorrection())); + assertTrue("one".equals(rsp.getCorrections().get(2).getCorrection())); + } count++; } assertTrue(("Three maps (8*9*10) should return 720 iterations but instead returned " + count), count == 720); suggestions.remove(new Token("CEE", 0, 2)); - iter = new PossibilityIterator(suggestions); + iter = new PossibilityIterator(suggestions, 100, 10000); count = 0; while (iter.hasNext()) { iter.next(); @@ -95,16 +132,16 @@ public class SpellPossibilityIteratorTest extends SolrTestCaseJ4 { assertTrue(("Two maps (8*9) should return 72 iterations but instead returned " + count), count == 72); suggestions.remove(new Token("BEE", 0, 2)); - iter = new PossibilityIterator(suggestions); + iter = new PossibilityIterator(suggestions, 5, 10000); count = 0; while (iter.hasNext()) { iter.next(); count++; } - assertTrue(("One map of 8 should return 8 iterations but instead returned " + count), count == 8); + assertTrue(("We requested 5 suggestions but got " + count), count == 5); suggestions.remove(new Token("AYE", 0, 2)); - iter = new PossibilityIterator(suggestions); + iter = new PossibilityIterator(suggestions, Integer.MAX_VALUE, 10000); count = 0; while (iter.hasNext()) { iter.next();