SOLR-2462: use of spellcheck.collate could result in extremely high memory usage

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1132729 13f79535-47bb-0310-9956-ffa450edef68
2011-06-06 19:14:48 +00:00 · 2011-06-06 19:14:48 +00:00 · bdee0a9764
parent 63083944b1
commit bdee0a9764
9 changed files with 109 additions and 27 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -277,6 +277,11 @@ Bug Fixes
  English-specific fieldTypes (Jan Høydahl, hossman, Robert Muir,
  yonik, Mike McCandless)
 * SOLR-2462: Fix extremely high memory usage problems with spellcheck.collate.
  Separately, an additional spellcheck.maxCollationEvaluations (default=10000)
  parameter is added to avoid excessive CPU time in extreme cases (e.g. long
  queries with many misspelled words).  (James Dyer via rmuir)
 ==================  3.2.0  ==================
 Versions of Major Components
 ---------------------
--- a/solr/src/common/org/apache/solr/common/params/SpellingParams.java
+++ b/solr/src/common/org/apache/solr/common/params/SpellingParams.java
@ -95,7 +95,15 @@ public interface SpellingParams {
   * Default=0. Ignored of "spellcheck.collate" is false. 
   * </p>
   */
-  public static final String SPELLCHECK_MAX_COLLATION_TRIES = SPELLCHECK_PREFIX + "maxCollationTries";
+  public static final String SPELLCHECK_MAX_COLLATION_TRIES = SPELLCHECK_PREFIX + "maxCollationTries";  
  /**
   * <p>
   * The maximum number of word correction combinations to rank and evaluate prior to deciding which collation
   * candidates to test against the index.  This is a performance safety-net in cases a user enters a query with
   * many misspelled words.  The default is 10,000 combinations. 
   * </p>
   */
  public static final String SPELLCHECK_MAX_COLLATION_EVALUATIONS = SPELLCHECK_PREFIX + "maxCollationEvaluations";
  /**
   * <p>
@ -105,7 +113,7 @@ public interface SpellingParams {
   * </p>
   */
  public static final String SPELLCHECK_COLLATE_EXTENDED_RESULTS = SPELLCHECK_PREFIX + "collateExtendedResults";
-  
+    
  /**
   * Certain spelling implementations may allow for an accuracy setting.
   */
--- a/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
+++ b/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
@ -172,11 +172,12 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
 			NamedList response) {
 		int maxCollations = params.getInt(SPELLCHECK_MAX_COLLATIONS, 1);
 		int maxCollationTries = params.getInt(SPELLCHECK_MAX_COLLATION_TRIES, 0);
 		int maxCollationEvaluations = params.getInt(SPELLCHECK_MAX_COLLATION_EVALUATIONS, 10000);
 		boolean collationExtendedResults = params.getBool(SPELLCHECK_COLLATE_EXTENDED_RESULTS, false);
 		boolean shard = params.getBool(ShardParams.IS_SHARD, false);
 		SpellCheckCollator collator = new SpellCheckCollator();
-		List<SpellCheckCollation> collations = collator.collate(spellingResult, q, rb, maxCollations, maxCollationTries);
+		List<SpellCheckCollation> collations = collator.collate(spellingResult, q, rb, maxCollations, maxCollationTries, maxCollationEvaluations);
 		//by sorting here we guarantee a non-distributed request returns all 
 		//results in the same order as a distributed request would, 
 		//even in cases when the internal rank is the same.
--- a/solr/src/java/org/apache/solr/spelling/PossibilityIterator.java
+++ b/solr/src/java/org/apache/solr/spelling/PossibilityIterator.java
@ -17,12 +17,13 @@ package org.apache.solr.spelling;
 */
 import java.util.ArrayList;
-import java.util.Collections;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.NoSuchElementException;
 import java.util.PriorityQueue;
 import org.apache.lucene.analysis.Token;
@ -38,8 +39,7 @@ import org.apache.lucene.analysis.Token;
 */
 public class PossibilityIterator implements Iterator<RankedSpellPossibility> {
 	private List<List<SpellCheckCorrection>> possibilityList = new ArrayList<List<SpellCheckCorrection>>();
-	private List<RankedSpellPossibility> rankedPossibilityList = new ArrayList<RankedSpellPossibility>();
+	private Iterator<RankedSpellPossibility> rankedPossibilityIterator = null;
 	private Iterator<RankedSpellPossibility> rankedPossibilityIterator;
 	private int correctionIndex[];
 	private boolean done = false;
@ -56,7 +56,7 @@ public class PossibilityIterator implements Iterator<RankedSpellPossibility> {
 	 * 
 	 * @param suggestions
 	 */
-	public PossibilityIterator(Map<Token, LinkedHashMap<String, Integer>> suggestions) {
+	public PossibilityIterator(Map<Token, LinkedHashMap<String, Integer>> suggestions, int maximumRequiredSuggestions, int maxEvaluations) {
 		for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) {
 			Token token = entry.getKey();
 			List<SpellCheckCorrection> possibleCorrections = new ArrayList<SpellCheckCorrection>();
@ -84,12 +84,27 @@ public class PossibilityIterator implements Iterator<RankedSpellPossibility> {
 				correctionIndex[i] = 0;
 			}
 		}
-
+		
-		while (internalHasNext()) {
+		long count = 0;
-			rankedPossibilityList.add(internalNext());
+		PriorityQueue<RankedSpellPossibility> rankedPossibilities = new PriorityQueue<RankedSpellPossibility>();		
 		while (count < maxEvaluations && internalHasNext()) {
 			RankedSpellPossibility rsp = internalNext();
 			count++;			
 			if(rankedPossibilities.size() >= maximumRequiredSuggestions && rsp.getRank() >= rankedPossibilities.peek().getRank()) {
 				continue;
 			}
 			rankedPossibilities.offer(rsp);
 			if(rankedPossibilities.size() > maximumRequiredSuggestions) {
 				rankedPossibilities.poll();
 			}
 		}
-		Collections.sort(rankedPossibilityList);
+		
-		rankedPossibilityIterator = rankedPossibilityList.iterator();
+		RankedSpellPossibility[] rpArr = new RankedSpellPossibility[rankedPossibilities.size()];
 		for(int i=rankedPossibilities.size() - 1  ; i>=0 ; i--) {
 			rpArr[i] = rankedPossibilities.remove();
 		}
 		rankedPossibilityIterator = Arrays.asList(rpArr).iterator();		
 	}
 	private boolean internalHasNext() {
--- a/solr/src/java/org/apache/solr/spelling/RankedSpellPossibility.java
+++ b/solr/src/java/org/apache/solr/spelling/RankedSpellPossibility.java
@ -22,8 +22,9 @@ public class RankedSpellPossibility implements Comparable<RankedSpellPossibility
 	private List<SpellCheckCorrection> corrections;
 	private int rank;
 	//Rank poorer suggestions ahead of better ones for use with a PriorityQueue
 	public int compareTo(RankedSpellPossibility rcl) {
-		return new Integer(rank).compareTo(rcl.rank);
+		return new Integer(rcl.rank).compareTo(rank);		
 	}
 	public List<SpellCheckCorrection> getCorrections() {
@ -41,4 +42,17 @@ public class RankedSpellPossibility implements Comparable<RankedSpellPossibility
 	public void setRank(int rank) {
 		this.rank = rank;
 	}
 	public String toString() {
 		StringBuilder sb = new StringBuilder();
 		sb.append("rank=").append(rank);
 		if(corrections != null) {
 			for(SpellCheckCorrection corr : corrections) {
 				sb.append("     ");
 				sb.append(corr.getOriginal()).append(">").append(corr.getCorrection()).append(" (").append(corr.getNumberOfOccurences()).append(")");
 			}
 		}
 		return sb.toString();
 	}
 }
--- a/solr/src/java/org/apache/solr/spelling/SpellCheckCollator.java
+++ b/solr/src/java/org/apache/solr/spelling/SpellCheckCollator.java
@ -36,7 +36,7 @@ public class SpellCheckCollator {
  private static final Logger LOG = LoggerFactory.getLogger(SpellCheckCollator.class);
  public List<SpellCheckCollation> collate(SpellingResult result, String originalQuery, ResponseBuilder ultimateResponse,
-                                           int maxCollations, int maxTries) {
+                                           int maxCollations, int maxTries, int maxEvaluations) {
    List<SpellCheckCollation> collations = new ArrayList<SpellCheckCollation>();
    QueryComponent queryComponent = null;
@ -62,7 +62,7 @@ public class SpellCheckCollator {
    int tryNo = 0;
    int collNo = 0;
-    PossibilityIterator possibilityIter = new PossibilityIterator(result.getSuggestions());
+    PossibilityIterator possibilityIter = new PossibilityIterator(result.getSuggestions(), maxTries, maxEvaluations);
    while (tryNo < maxTries && collNo < maxCollations && possibilityIter.hasNext()) {
      RankedSpellPossibility possibility = possibilityIter.next();
--- a/solr/src/test/org/apache/solr/client/solrj/response/TestSpellCheckResponse.java
+++ b/solr/src/test/org/apache/solr/client/solrj/response/TestSpellCheckResponse.java
@ -143,7 +143,7 @@ public class TestSpellCheckResponse extends SolrJettyTestBase {
    //Test Expanded Collation Results
    query.set(SpellingParams.SPELLCHECK_COLLATE_EXTENDED_RESULTS, true);
-    query.set(SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, 5);
+    query.set(SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, 10);
    query.set(SpellingParams.SPELLCHECK_MAX_COLLATIONS, 2); 
    request = new QueryRequest(query);
    response = request.process(server).getSpellCheckResponse();
--- a/solr/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java
+++ b/solr/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java
@ -60,8 +60,8 @@ public class SpellCheckCollatorTest extends SolrTestCaseJ4 {
 		params.add(SpellCheckComponent.SPELLCHECK_BUILD, "true");
 		params.add(SpellCheckComponent.SPELLCHECK_COUNT, "10");		
 		params.add(SpellCheckComponent.SPELLCHECK_COLLATE, "true");
-		params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "5");
+		params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10");
-		params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "2");
+		params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10");
 		params.add(CommonParams.Q, "lowerfilt:(+fauth +home +loane)");
 		params.add(CommonParams.FQ, "NOT(id:1)");
@ -77,8 +77,10 @@ public class SpellCheckCollatorTest extends SolrTestCaseJ4 {
 		NamedList spellCheck = (NamedList) values.get("spellcheck");
 		NamedList suggestions = (NamedList) spellCheck.get("suggestions");
 		List<String> collations = suggestions.getAll("collation");
-		assertTrue(collations.size() == 1);
+		assertTrue(collations.size() > 0);
-		assertTrue(collations.get(0).equals("lowerfilt:(+faith +hope +love)"));		
+		for(String collation : collations) {
 			assertTrue(!collation.equals("lowerfilt:(+faith +hope +loaves)"));	
 		}
 	}
 	@Test
@ -180,7 +182,7 @@ public class SpellCheckCollatorTest extends SolrTestCaseJ4 {
 		// combination exists.
 		params.remove(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES);
 		params.remove(SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS);
-		params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "5");
+		params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10");
 		params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "2");
 		handler = core.getRequestHandler("spellCheckCompRH");
 		rsp = new SolrQueryResponse();
--- a/solr/src/test/org/apache/solr/spelling/SpellPossibilityIteratorTest.java
+++ b/solr/src/test/org/apache/solr/spelling/SpellPossibilityIteratorTest.java
@ -28,6 +28,7 @@ import org.junit.Test;
 public class SpellPossibilityIteratorTest extends SolrTestCaseJ4 {
 	private static Map<Token, LinkedHashMap<String, Integer>> suggestions = new LinkedHashMap<Token, LinkedHashMap<String, Integer>>();
 	private static Map<Token, LinkedHashMap<String, Integer>> lotsaSuggestions = new LinkedHashMap<Token, LinkedHashMap<String, Integer>>();
 	@Override
  @Before
@ -72,21 +73,57 @@ public class SpellPossibilityIteratorTest extends SolrTestCaseJ4 {
 		suggestions.put(new Token("AYE", 0, 2), AYE);
 		suggestions.put(new Token("BEE", 0, 2), BEE);
 		suggestions.put(new Token("CEE", 0, 2), CEE);
 		lotsaSuggestions.put(new Token("AYE", 0, 2), AYE);
 		lotsaSuggestions.put(new Token("BEE", 0, 2), BEE);
 		lotsaSuggestions.put(new Token("CEE", 0, 2), CEE);
 		lotsaSuggestions.put(new Token("AYE1", 0, 3), AYE);
 		lotsaSuggestions.put(new Token("BEE1", 0, 3), BEE);
 		lotsaSuggestions.put(new Token("CEE1", 0, 3), CEE);
 		lotsaSuggestions.put(new Token("AYE2", 0, 3), AYE);
 		lotsaSuggestions.put(new Token("BEE2", 0, 3), BEE);
 		lotsaSuggestions.put(new Token("CEE2", 0, 3), CEE);
 		lotsaSuggestions.put(new Token("AYE3", 0, 3), AYE);
 		lotsaSuggestions.put(new Token("BEE3", 0, 3), BEE);
 		lotsaSuggestions.put(new Token("CEE3", 0, 3), CEE);
 		lotsaSuggestions.put(new Token("AYE4", 0, 3), AYE);
 		lotsaSuggestions.put(new Token("BEE4", 0, 3), BEE);
 		lotsaSuggestions.put(new Token("CEE4", 0, 3), CEE);
 	}
 	@Test
 	public void testScalability() throws Exception {
 		PossibilityIterator iter = new PossibilityIterator(lotsaSuggestions, 1000, 10000);
 		int count = 0;
 		while (iter.hasNext()) {			
 			RankedSpellPossibility rsp = iter.next();
 			count++;
 		}
 		assertTrue(count==1000);
 	}
 	@Test
 	public void testSpellPossibilityIterator() throws Exception {
-		PossibilityIterator iter = new PossibilityIterator(suggestions);
+		PossibilityIterator iter = new PossibilityIterator(suggestions, 1000, 10000);
 		int count = 0;
 		while (iter.hasNext()) {
-			iter.next();
+			RankedSpellPossibility rsp = iter.next();
 			if(count==0) {
 				assertTrue("I".equals(rsp.getCorrections().get(0).getCorrection()));
 				assertTrue("alpha".equals(rsp.getCorrections().get(1).getCorrection()));
 				assertTrue("one".equals(rsp.getCorrections().get(2).getCorrection()));
 			}
 			count++;
 		}
 		assertTrue(("Three maps (8*9*10) should return 720 iterations but instead returned " + count), count == 720);
 		suggestions.remove(new Token("CEE", 0, 2));
-		iter = new PossibilityIterator(suggestions);
+		iter = new PossibilityIterator(suggestions, 100, 10000);
 		count = 0;
 		while (iter.hasNext()) {
 			iter.next();
@ -95,16 +132,16 @@ public class SpellPossibilityIteratorTest extends SolrTestCaseJ4 {
 		assertTrue(("Two maps (8*9) should return 72 iterations but instead returned " + count), count == 72);
 		suggestions.remove(new Token("BEE", 0, 2));
-		iter = new PossibilityIterator(suggestions);
+		iter = new PossibilityIterator(suggestions, 5, 10000);
 		count = 0;
 		while (iter.hasNext()) {
 			iter.next();
 			count++;
 		}
-		assertTrue(("One map of 8 should return 8 iterations but instead returned " + count), count == 8);
+		assertTrue(("We requested 5 suggestions but got " + count), count == 5);
 		suggestions.remove(new Token("AYE", 0, 2));
-		iter = new PossibilityIterator(suggestions);
+		iter = new PossibilityIterator(suggestions, Integer.MAX_VALUE, 10000);
 		count = 0;
 		while (iter.hasNext()) {
 			iter.next();