From bdee0a976497332dd922510c0ac459ba0dbb0af4 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Mon, 6 Jun 2011 19:14:48 +0000
Subject: [PATCH] SOLR-2462: use of spellcheck.collate could result in
extremely high memory usage
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1132729 13f79535-47bb-0310-9956-ffa450edef68
---
solr/CHANGES.txt | 5 ++
.../solr/common/params/SpellingParams.java | 12 ++++-
.../component/SpellCheckComponent.java | 3 +-
.../solr/spelling/PossibilityIterator.java | 33 +++++++++----
.../solr/spelling/RankedSpellPossibility.java | 16 +++++-
.../solr/spelling/SpellCheckCollator.java | 4 +-
.../response/TestSpellCheckResponse.java | 2 +-
.../solr/spelling/SpellCheckCollatorTest.java | 12 +++--
.../SpellPossibilityIteratorTest.java | 49 ++++++++++++++++---
9 files changed, 109 insertions(+), 27 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index a30f7e700ec..97a6b5a1183 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -277,6 +277,11 @@ Bug Fixes
English-specific fieldTypes (Jan Høydahl, hossman, Robert Muir,
yonik, Mike McCandless)
+* SOLR-2462: Fix extremely high memory usage problems with spellcheck.collate.
+ Separately, an additional spellcheck.maxCollationEvaluations (default=10000)
+ parameter is added to avoid excessive CPU time in extreme cases (e.g. long
+ queries with many misspelled words). (James Dyer via rmuir)
+
================== 3.2.0 ==================
Versions of Major Components
---------------------
diff --git a/solr/src/common/org/apache/solr/common/params/SpellingParams.java b/solr/src/common/org/apache/solr/common/params/SpellingParams.java
index 71d2aa5f348..50a10a2a577 100644
--- a/solr/src/common/org/apache/solr/common/params/SpellingParams.java
+++ b/solr/src/common/org/apache/solr/common/params/SpellingParams.java
@@ -95,7 +95,15 @@ public interface SpellingParams {
* Default=0. Ignored of "spellcheck.collate" is false.
*
*/
- public static final String SPELLCHECK_MAX_COLLATION_TRIES = SPELLCHECK_PREFIX + "maxCollationTries";
+ public static final String SPELLCHECK_MAX_COLLATION_TRIES = SPELLCHECK_PREFIX + "maxCollationTries";
+ /**
+ *
+ * The maximum number of word correction combinations to rank and evaluate prior to deciding which collation
+ * candidates to test against the index. This is a performance safety-net in cases a user enters a query with
+ * many misspelled words. The default is 10,000 combinations.
+ *
+ */
+ public static final String SPELLCHECK_MAX_COLLATION_EVALUATIONS = SPELLCHECK_PREFIX + "maxCollationEvaluations";
/**
*
@@ -105,7 +113,7 @@ public interface SpellingParams {
*
*/
public static final String SPELLCHECK_COLLATE_EXTENDED_RESULTS = SPELLCHECK_PREFIX + "collateExtendedResults";
-
+
/**
* Certain spelling implementations may allow for an accuracy setting.
*/
diff --git a/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java b/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
index 897eb87af9b..559bbd81f83 100644
--- a/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
+++ b/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
@@ -172,11 +172,12 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
NamedList response) {
int maxCollations = params.getInt(SPELLCHECK_MAX_COLLATIONS, 1);
int maxCollationTries = params.getInt(SPELLCHECK_MAX_COLLATION_TRIES, 0);
+ int maxCollationEvaluations = params.getInt(SPELLCHECK_MAX_COLLATION_EVALUATIONS, 10000);
boolean collationExtendedResults = params.getBool(SPELLCHECK_COLLATE_EXTENDED_RESULTS, false);
boolean shard = params.getBool(ShardParams.IS_SHARD, false);
SpellCheckCollator collator = new SpellCheckCollator();
- List collations = collator.collate(spellingResult, q, rb, maxCollations, maxCollationTries);
+ List collations = collator.collate(spellingResult, q, rb, maxCollations, maxCollationTries, maxCollationEvaluations);
//by sorting here we guarantee a non-distributed request returns all
//results in the same order as a distributed request would,
//even in cases when the internal rank is the same.
diff --git a/solr/src/java/org/apache/solr/spelling/PossibilityIterator.java b/solr/src/java/org/apache/solr/spelling/PossibilityIterator.java
index ec3aaa7db94..84e41e27ede 100644
--- a/solr/src/java/org/apache/solr/spelling/PossibilityIterator.java
+++ b/solr/src/java/org/apache/solr/spelling/PossibilityIterator.java
@@ -17,12 +17,13 @@ package org.apache.solr.spelling;
*/
import java.util.ArrayList;
-import java.util.Collections;
+import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
+import java.util.PriorityQueue;
import org.apache.lucene.analysis.Token;
@@ -38,8 +39,7 @@ import org.apache.lucene.analysis.Token;
*/
public class PossibilityIterator implements Iterator {
private List> possibilityList = new ArrayList>();
- private List rankedPossibilityList = new ArrayList();
- private Iterator rankedPossibilityIterator;
+ private Iterator rankedPossibilityIterator = null;
private int correctionIndex[];
private boolean done = false;
@@ -56,7 +56,7 @@ public class PossibilityIterator implements Iterator {
*
* @param suggestions
*/
- public PossibilityIterator(Map> suggestions) {
+ public PossibilityIterator(Map> suggestions, int maximumRequiredSuggestions, int maxEvaluations) {
for (Map.Entry> entry : suggestions.entrySet()) {
Token token = entry.getKey();
List possibleCorrections = new ArrayList();
@@ -84,12 +84,27 @@ public class PossibilityIterator implements Iterator {
correctionIndex[i] = 0;
}
}
-
- while (internalHasNext()) {
- rankedPossibilityList.add(internalNext());
+
+ long count = 0;
+ PriorityQueue rankedPossibilities = new PriorityQueue();
+ while (count < maxEvaluations && internalHasNext()) {
+ RankedSpellPossibility rsp = internalNext();
+ count++;
+
+ if(rankedPossibilities.size() >= maximumRequiredSuggestions && rsp.getRank() >= rankedPossibilities.peek().getRank()) {
+ continue;
+ }
+ rankedPossibilities.offer(rsp);
+ if(rankedPossibilities.size() > maximumRequiredSuggestions) {
+ rankedPossibilities.poll();
+ }
}
- Collections.sort(rankedPossibilityList);
- rankedPossibilityIterator = rankedPossibilityList.iterator();
+
+ RankedSpellPossibility[] rpArr = new RankedSpellPossibility[rankedPossibilities.size()];
+ for(int i=rankedPossibilities.size() - 1 ; i>=0 ; i--) {
+ rpArr[i] = rankedPossibilities.remove();
+ }
+ rankedPossibilityIterator = Arrays.asList(rpArr).iterator();
}
private boolean internalHasNext() {
diff --git a/solr/src/java/org/apache/solr/spelling/RankedSpellPossibility.java b/solr/src/java/org/apache/solr/spelling/RankedSpellPossibility.java
index 02103f9c3f5..61093fc79a9 100644
--- a/solr/src/java/org/apache/solr/spelling/RankedSpellPossibility.java
+++ b/solr/src/java/org/apache/solr/spelling/RankedSpellPossibility.java
@@ -22,8 +22,9 @@ public class RankedSpellPossibility implements Comparable corrections;
private int rank;
+ //Rank poorer suggestions ahead of better ones for use with a PriorityQueue
public int compareTo(RankedSpellPossibility rcl) {
- return new Integer(rank).compareTo(rcl.rank);
+ return new Integer(rcl.rank).compareTo(rank);
}
public List getCorrections() {
@@ -41,4 +42,17 @@ public class RankedSpellPossibility implements Comparable").append(corr.getCorrection()).append(" (").append(corr.getNumberOfOccurences()).append(")");
+ }
+ }
+ return sb.toString();
+ }
}
diff --git a/solr/src/java/org/apache/solr/spelling/SpellCheckCollator.java b/solr/src/java/org/apache/solr/spelling/SpellCheckCollator.java
index 6b5c37b0ef1..5ed1ab3e4c0 100644
--- a/solr/src/java/org/apache/solr/spelling/SpellCheckCollator.java
+++ b/solr/src/java/org/apache/solr/spelling/SpellCheckCollator.java
@@ -36,7 +36,7 @@ public class SpellCheckCollator {
private static final Logger LOG = LoggerFactory.getLogger(SpellCheckCollator.class);
public List collate(SpellingResult result, String originalQuery, ResponseBuilder ultimateResponse,
- int maxCollations, int maxTries) {
+ int maxCollations, int maxTries, int maxEvaluations) {
List collations = new ArrayList();
QueryComponent queryComponent = null;
@@ -62,7 +62,7 @@ public class SpellCheckCollator {
int tryNo = 0;
int collNo = 0;
- PossibilityIterator possibilityIter = new PossibilityIterator(result.getSuggestions());
+ PossibilityIterator possibilityIter = new PossibilityIterator(result.getSuggestions(), maxTries, maxEvaluations);
while (tryNo < maxTries && collNo < maxCollations && possibilityIter.hasNext()) {
RankedSpellPossibility possibility = possibilityIter.next();
diff --git a/solr/src/test/org/apache/solr/client/solrj/response/TestSpellCheckResponse.java b/solr/src/test/org/apache/solr/client/solrj/response/TestSpellCheckResponse.java
index 646d8bca727..7b11bf0f4ac 100644
--- a/solr/src/test/org/apache/solr/client/solrj/response/TestSpellCheckResponse.java
+++ b/solr/src/test/org/apache/solr/client/solrj/response/TestSpellCheckResponse.java
@@ -143,7 +143,7 @@ public class TestSpellCheckResponse extends SolrJettyTestBase {
//Test Expanded Collation Results
query.set(SpellingParams.SPELLCHECK_COLLATE_EXTENDED_RESULTS, true);
- query.set(SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, 5);
+ query.set(SpellingParams.SPELLCHECK_MAX_COLLATION_TRIES, 10);
query.set(SpellingParams.SPELLCHECK_MAX_COLLATIONS, 2);
request = new QueryRequest(query);
response = request.process(server).getSpellCheckResponse();
diff --git a/solr/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java b/solr/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java
index 669676f6341..c819c9f8a8a 100644
--- a/solr/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java
+++ b/solr/src/test/org/apache/solr/spelling/SpellCheckCollatorTest.java
@@ -60,8 +60,8 @@ public class SpellCheckCollatorTest extends SolrTestCaseJ4 {
params.add(SpellCheckComponent.SPELLCHECK_BUILD, "true");
params.add(SpellCheckComponent.SPELLCHECK_COUNT, "10");
params.add(SpellCheckComponent.SPELLCHECK_COLLATE, "true");
- params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "5");
- params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "2");
+ params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10");
+ params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "10");
params.add(CommonParams.Q, "lowerfilt:(+fauth +home +loane)");
params.add(CommonParams.FQ, "NOT(id:1)");
@@ -77,8 +77,10 @@ public class SpellCheckCollatorTest extends SolrTestCaseJ4 {
NamedList spellCheck = (NamedList) values.get("spellcheck");
NamedList suggestions = (NamedList) spellCheck.get("suggestions");
List collations = suggestions.getAll("collation");
- assertTrue(collations.size() == 1);
- assertTrue(collations.get(0).equals("lowerfilt:(+faith +hope +love)"));
+ assertTrue(collations.size() > 0);
+ for(String collation : collations) {
+ assertTrue(!collation.equals("lowerfilt:(+faith +hope +loaves)"));
+ }
}
@Test
@@ -180,7 +182,7 @@ public class SpellCheckCollatorTest extends SolrTestCaseJ4 {
// combination exists.
params.remove(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES);
params.remove(SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS);
- params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "5");
+ params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATION_TRIES, "10");
params.add(SpellCheckComponent.SPELLCHECK_MAX_COLLATIONS, "2");
handler = core.getRequestHandler("spellCheckCompRH");
rsp = new SolrQueryResponse();
diff --git a/solr/src/test/org/apache/solr/spelling/SpellPossibilityIteratorTest.java b/solr/src/test/org/apache/solr/spelling/SpellPossibilityIteratorTest.java
index b70ba1fb95a..6cf12d76e06 100644
--- a/solr/src/test/org/apache/solr/spelling/SpellPossibilityIteratorTest.java
+++ b/solr/src/test/org/apache/solr/spelling/SpellPossibilityIteratorTest.java
@@ -28,6 +28,7 @@ import org.junit.Test;
public class SpellPossibilityIteratorTest extends SolrTestCaseJ4 {
private static Map> suggestions = new LinkedHashMap>();
+ private static Map> lotsaSuggestions = new LinkedHashMap>();
@Override
@Before
@@ -72,21 +73,57 @@ public class SpellPossibilityIteratorTest extends SolrTestCaseJ4 {
suggestions.put(new Token("AYE", 0, 2), AYE);
suggestions.put(new Token("BEE", 0, 2), BEE);
suggestions.put(new Token("CEE", 0, 2), CEE);
+
+ lotsaSuggestions.put(new Token("AYE", 0, 2), AYE);
+ lotsaSuggestions.put(new Token("BEE", 0, 2), BEE);
+ lotsaSuggestions.put(new Token("CEE", 0, 2), CEE);
+
+ lotsaSuggestions.put(new Token("AYE1", 0, 3), AYE);
+ lotsaSuggestions.put(new Token("BEE1", 0, 3), BEE);
+ lotsaSuggestions.put(new Token("CEE1", 0, 3), CEE);
+
+ lotsaSuggestions.put(new Token("AYE2", 0, 3), AYE);
+ lotsaSuggestions.put(new Token("BEE2", 0, 3), BEE);
+ lotsaSuggestions.put(new Token("CEE2", 0, 3), CEE);
+
+ lotsaSuggestions.put(new Token("AYE3", 0, 3), AYE);
+ lotsaSuggestions.put(new Token("BEE3", 0, 3), BEE);
+ lotsaSuggestions.put(new Token("CEE3", 0, 3), CEE);
+
+ lotsaSuggestions.put(new Token("AYE4", 0, 3), AYE);
+ lotsaSuggestions.put(new Token("BEE4", 0, 3), BEE);
+ lotsaSuggestions.put(new Token("CEE4", 0, 3), CEE);
+ }
+
+ @Test
+ public void testScalability() throws Exception {
+ PossibilityIterator iter = new PossibilityIterator(lotsaSuggestions, 1000, 10000);
+ int count = 0;
+ while (iter.hasNext()) {
+ RankedSpellPossibility rsp = iter.next();
+ count++;
+ }
+ assertTrue(count==1000);
}
@Test
public void testSpellPossibilityIterator() throws Exception {
- PossibilityIterator iter = new PossibilityIterator(suggestions);
+ PossibilityIterator iter = new PossibilityIterator(suggestions, 1000, 10000);
int count = 0;
while (iter.hasNext()) {
- iter.next();
+ RankedSpellPossibility rsp = iter.next();
+ if(count==0) {
+ assertTrue("I".equals(rsp.getCorrections().get(0).getCorrection()));
+ assertTrue("alpha".equals(rsp.getCorrections().get(1).getCorrection()));
+ assertTrue("one".equals(rsp.getCorrections().get(2).getCorrection()));
+ }
count++;
}
assertTrue(("Three maps (8*9*10) should return 720 iterations but instead returned " + count), count == 720);
suggestions.remove(new Token("CEE", 0, 2));
- iter = new PossibilityIterator(suggestions);
+ iter = new PossibilityIterator(suggestions, 100, 10000);
count = 0;
while (iter.hasNext()) {
iter.next();
@@ -95,16 +132,16 @@ public class SpellPossibilityIteratorTest extends SolrTestCaseJ4 {
assertTrue(("Two maps (8*9) should return 72 iterations but instead returned " + count), count == 72);
suggestions.remove(new Token("BEE", 0, 2));
- iter = new PossibilityIterator(suggestions);
+ iter = new PossibilityIterator(suggestions, 5, 10000);
count = 0;
while (iter.hasNext()) {
iter.next();
count++;
}
- assertTrue(("One map of 8 should return 8 iterations but instead returned " + count), count == 8);
+ assertTrue(("We requested 5 suggestions but got " + count), count == 5);
suggestions.remove(new Token("AYE", 0, 2));
- iter = new PossibilityIterator(suggestions);
+ iter = new PossibilityIterator(suggestions, Integer.MAX_VALUE, 10000);
count = 0;
while (iter.hasNext()) {
iter.next();