SOLR-2571: Fix DirectSolrSpellChecker's numeric params to use numeric types, add example and test for thresholdTokenFrequency

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1132855 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-06-07 02:38:42 +00:00
parent d280169fef
commit 6b6e5e6553
6 changed files with 115 additions and 31 deletions

View File

@ -94,9 +94,9 @@ New Features
* SOLR-792: Adding PivotFacetComponent for Hierarchical faceting
(erik, Jeremy Hinegardner, Thibaut Lassalle, ryan)
* LUCENE-2507: Added DirectSolrSpellChecker, which uses Lucene's DirectSpellChecker
* LUCENE-2507, SOLR-2571: Added DirectSolrSpellChecker, which uses Lucene's DirectSpellChecker
to retrieve correction candidates directly from the term dictionary using
levenshtein automata. (rmuir)
levenshtein automata. (James Dyer, rmuir)
* SOLR-1873: SolrCloud - added shared/central config and core/shard management via zookeeper,
built-in load balancing, and infrastructure for future SolrCloud work. (yonik, Mark Miller)
@ -282,6 +282,13 @@ Bug Fixes
parameter is added to avoid excessive CPU time in extreme cases (e.g. long
queries with many misspelled words). (James Dyer via rmuir)
Other Changes
----------------------
* SOLR-2571: Add a commented out example of the spellchecker's thresholdTokenFrequency
parameter to the example solrconfig.xml, and also add a unit test for this feature.
(James Dyer via rmuir)
================== 3.2.0 ==================
Versions of Major Components
---------------------

View File

@ -1084,6 +1084,9 @@
<str name="name">default</str>
<str name="field">name</str>
<str name="spellcheckIndexDir">spellchecker</str>
<!-- uncomment this to require terms to occur in at least 1% of the documents in order to be included in the dictionary
<float name="thresholdTokenFrequency">.01</float>
-->
</lst>
<!-- a spellchecker that uses no auxiliary on disk index -->

View File

@ -59,31 +59,29 @@ import org.slf4j.LoggerFactory;
public class DirectSolrSpellChecker extends SolrSpellChecker {
private static final Logger LOG = LoggerFactory.getLogger(DirectSolrSpellChecker.class);
/** Field to use as the source of terms */
public static final String FIELD = "field";
// configuration params shared with other spellcheckers
public static final String COMPARATOR_CLASS = AbstractLuceneSpellChecker.COMPARATOR_CLASS;
public static final String SCORE_COMP = AbstractLuceneSpellChecker.SCORE_COMP;
public static final String FREQ_COMP = AbstractLuceneSpellChecker.FREQ_COMP;
public static final String FIELD = AbstractLuceneSpellChecker.FIELD;
public static final String STRING_DISTANCE = AbstractLuceneSpellChecker.STRING_DISTANCE;
public static final String ACCURACY = AbstractLuceneSpellChecker.ACCURACY;
public static final String THRESHOLD_TOKEN_FREQUENCY = IndexBasedSpellChecker.THRESHOLD_TOKEN_FREQUENCY;
public static final String STRING_DISTANCE = "distanceMeasure";
public static final String INTERNAL_DISTANCE = "internal";
public static final String ACCURACY = "accuracy";
public static final float DEFAULT_ACCURACY = 0.5f;
public static final float DEFAULT_THRESHOLD_TOKEN_FREQUENCY = 0.0f;
public static final String MAXEDITS = "maxEdits";
public static final int DEFAULT_MAXEDITS = 2;
// params specific to this implementation
public static final String MINPREFIX = "minPrefix";
public static final int DEFAULT_MINPREFIX = 1;
public static final String MAXINSPECTIONS = "maxInspections";
public static final int DEFAULT_MAXINSPECTIONS = 5;
public static final String COMPARATOR_CLASS = "comparatorClass";
public static final String SCORE_COMP = "score";
public static final String FREQ_COMP = "freq";
public static final String THRESHOLD = "thresholdTokenFrequency";
public static final float DEFAULT_THRESHOLD = 0.0f;
public static final String MINQUERYLENGTH = "minQueryLength";
public static final int DEFAULT_MINQUERYLENGTH = 4;
@ -117,39 +115,39 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
field = (String) config.get(FIELD);
float minAccuracy = DEFAULT_ACCURACY;
String accuracy = (String) config.get(ACCURACY);
Float accuracy = (Float) config.get(ACCURACY);
if (accuracy != null)
minAccuracy = Float.parseFloat(accuracy);
minAccuracy = accuracy;
int maxEdits = DEFAULT_MAXEDITS;
String edits = (String) config.get(MAXEDITS);
Integer edits = (Integer) config.get(MAXEDITS);
if (edits != null)
maxEdits = Integer.parseInt(edits);
maxEdits = edits;
int minPrefix = DEFAULT_MINPREFIX;
String prefix = (String) config.get(MINPREFIX);
Integer prefix = (Integer) config.get(MINPREFIX);
if (prefix != null)
minPrefix = Integer.parseInt(prefix);
minPrefix = prefix;
int maxInspections = DEFAULT_MAXINSPECTIONS;
String inspections = (String) config.get(MAXINSPECTIONS);
Integer inspections = (Integer) config.get(MAXINSPECTIONS);
if (inspections != null)
maxInspections = Integer.parseInt(inspections);
maxInspections = inspections;
float minThreshold = DEFAULT_THRESHOLD;
String threshold = (String) config.get(THRESHOLD);
float minThreshold = DEFAULT_THRESHOLD_TOKEN_FREQUENCY;
Float threshold = (Float) config.get(THRESHOLD_TOKEN_FREQUENCY);
if (threshold != null)
minThreshold = Float.parseFloat(threshold);
minThreshold = threshold;
int minQueryLength = DEFAULT_MINQUERYLENGTH;
String queryLength = (String) config.get(MINQUERYLENGTH);
Integer queryLength = (Integer) config.get(MINQUERYLENGTH);
if (queryLength != null)
minQueryLength = Integer.parseInt(queryLength);
minQueryLength = queryLength;
float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY;
String queryFreq = (String) config.get(MAXQUERYFREQUENCY);
Float queryFreq = (Float) config.get(MAXQUERYFREQUENCY);
if (queryFreq != null)
maxQueryFrequency = Float.parseFloat(queryFreq);
maxQueryFrequency = queryFreq;
checker.setComparator(comp);
checker.setDistance(sd);

View File

@ -49,6 +49,21 @@ Config for testing spellcheck component
<str name="spellcheckIndexDir">spellchecker1</str>
<str name="buildOnCommit">true</str>
</lst>
<lst name="spellchecker">
<str name="name">threshold</str>
<str name="field">lowerfilt</str>
<str name="spellcheckIndexDir">spellcheckerThreshold</str>
<str name="buildOnCommit">true</str>
<float name="thresholdTokenFrequency">.29</float>
</lst>
<lst name="spellchecker">
<str name="name">threshold_direct</str>
<str name="classname">solr.DirectSolrSpellChecker</str>
<str name="field">lowerfilt</str>
<str name="spellcheckIndexDir">spellcheckerThreshold</str>
<str name="buildOnCommit">true</str>
<float name="thresholdTokenFrequency">.29</float>
</lst>
<lst name="spellchecker">
<str name="name">multipleFields</str>
<str name="field">lowerfilt1and2</str>

View File

@ -21,10 +21,15 @@ import java.io.File;
import java.util.*;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SpellingParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.spelling.AbstractLuceneSpellChecker;
import org.junit.BeforeClass;
@ -188,4 +193,60 @@ public class SpellCheckComponentTest extends SolrTestCaseJ4 {
assertQ(req, "//arr[@name='suggestion'][.='lucenejava']");
}
/**
 * Exercises the thresholdTokenFrequency setting through the "threshold" and
 * "threshold_direct" spellcheck dictionaries: a misspelling of a frequent term
 * ("documenq") must yield a suggestion, while a misspelling of a rare term
 * ("anotheq") must yield none.
 */
@Test
public void testThresholdTokenFrequency() throws Exception {
//"document" is in 2 documents but "another" is only in 1.
//So with the configured thresholdTokenFrequency, "another" is absent from the dictionary
//while "document" is present.
//NOTE(review): this comment used to quote a threshold of 15%, but the "threshold" and
//"threshold_direct" spellcheckers in the test config are declared with
//thresholdTokenFrequency=.29 - confirm which value is intended.

//Index-based "threshold" dictionary: extended results carry the word and its frequency.
assertJQ(req("qt",rh, SpellCheckComponent.COMPONENT_NAME, "true", "q","documenq", SpellCheckComponent.SPELLCHECK_DICT, "threshold", SpellCheckComponent.SPELLCHECK_COUNT,"5", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS,"true")
,"/spellcheck/suggestions/[1]/suggestion==[{'word':'document','freq':2}]"
);
//TODO: DirectSolrSpellChecker returns a different format. Is this OK? Does SOLRJ need tweaking to handle this???
//NOTE(review): the expected value below ends with an unbalanced ']' - presumably the JSON
//matcher ignores trailing characters; confirm before relying on this assertion.
assertJQ(req("qt",rh, SpellCheckComponent.COMPONENT_NAME, "true", "q","documenq", SpellCheckComponent.SPELLCHECK_DICT, "threshold_direct", SpellCheckComponent.SPELLCHECK_COUNT,"5", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS,"true")
,"/spellcheck/suggestions/[1]/suggestion==['document']]"
);
//TODO: how do we make this into a 1-liner using "assertQ()" ???
//Negative case, driven through the request handler directly so the raw response
//NamedList can be inspected: "anotheq" must produce no suggestion.
SolrCore core = h.getCore();
SearchComponent speller = core.getSearchComponent("spellcheck");
assertTrue("speller is null and it shouldn't be", speller != null);
ModifiableSolrParams params = new ModifiableSolrParams();
params.add(SpellCheckComponent.COMPONENT_NAME, "true");
params.add(SpellCheckComponent.SPELLCHECK_COUNT, "10");
params.add(SpellCheckComponent.SPELLCHECK_DICT, "threshold");
params.add(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS,"true");
params.add(CommonParams.Q, "anotheq");
SolrRequestHandler handler = core.getRequestHandler("spellCheckCompRH");
SolrQueryResponse rsp = new SolrQueryResponse();
rsp.add("responseHeader", new SimpleOrderedMap());
//From here on the local variable "req" is the request object; the req(...) helper
//used above is a method and is not affected by this declaration.
SolrQueryRequest req = new LocalSolrQueryRequest(core, params);
handler.handleRequest(req, rsp);
req.close();
NamedList values = rsp.getValues();
NamedList spellCheck = (NamedList) values.get("spellcheck");
NamedList suggestions = (NamedList) spellCheck.get("suggestions");
//No suggestion entry at all, and the query is reported as misspelled.
assertTrue(suggestions.get("suggestion")==null);
assertTrue((Boolean) suggestions.get("correctlySpelled")==false);
//Repeat the same query against the "threshold_direct" dictionary, reusing params
//with only the dictionary name swapped.
params.remove(SpellCheckComponent.SPELLCHECK_DICT);
params.add(SpellCheckComponent.SPELLCHECK_DICT, "threshold_direct");
rsp = new SolrQueryResponse();
rsp.add("responseHeader", new SimpleOrderedMap());
req = new LocalSolrQueryRequest(core, params);
handler.handleRequest(req, rsp);
req.close();
values = rsp.getValues();
spellCheck = (NamedList) values.get("spellcheck");
suggestions = (NamedList) spellCheck.get("suggestions");
assertTrue(suggestions.get("suggestion")==null);
//TODO: Why is DirectSolrSpellChecker returning "true" here? Is that OK?
//assertTrue((Boolean) suggestions.get("correctlySpelled")==false);
}
}

View File

@ -55,7 +55,7 @@ public class DirectSolrSpellCheckerTest extends SolrTestCaseJ4 {
NamedList spellchecker = new NamedList();
spellchecker.add("classname", DirectSolrSpellChecker.class.getName());
spellchecker.add(DirectSolrSpellChecker.FIELD, "teststop");
spellchecker.add(DirectSolrSpellChecker.MINQUERYLENGTH, "2"); // we will try "fob"
spellchecker.add(DirectSolrSpellChecker.MINQUERYLENGTH, 2); // we will try "fob"
SolrCore core = h.getCore();
checker.init(spellchecker, core);