SOLR-2571: Fix DirectSolrSpellchecker's numeric params to use numeric types, add example and test for thresholdTokenFrequency

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1132855 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-06-07 02:38:42 +00:00
parent d280169fef
commit 6b6e5e6553
6 changed files with 115 additions and 31 deletions

View File

@ -94,9 +94,9 @@ New Features
* SOLR-792: Adding PivotFacetComponent for Hierarchical faceting * SOLR-792: Adding PivotFacetComponent for Hierarchical faceting
(erik, Jeremy Hinegardner, Thibaut Lassalle, ryan) (erik, Jeremy Hinegardner, Thibaut Lassalle, ryan)
* LUCENE-2507: Added DirectSolrSpellChecker, which uses Lucene's DirectSpellChecker * LUCENE-2507, SOLR-2571: Added DirectSolrSpellChecker, which uses Lucene's DirectSpellChecker
to retrieve correction candidates directly from the term dictionary using to retrieve correction candidates directly from the term dictionary using
levenshtein automata. (rmuir) levenshtein automata. (James Dyer, rmuir)
* SOLR-1873: SolrCloud - added shared/central config and core/shard management via zookeeper, * SOLR-1873: SolrCloud - added shared/central config and core/shard management via zookeeper,
built-in load balancing, and infrastructure for future SolrCloud work. (yonik, Mark Miller) built-in load balancing, and infrastructure for future SolrCloud work. (yonik, Mark Miller)
@ -282,6 +282,13 @@ Bug Fixes
parameter is added to avoid excessive CPU time in extreme cases (e.g. long parameter is added to avoid excessive CPU time in extreme cases (e.g. long
queries with many misspelled words). (James Dyer via rmuir) queries with many misspelled words). (James Dyer via rmuir)
Other Changes
----------------------
* SOLR-2571: Add a commented out example of the spellchecker's thresholdTokenFrequency
parameter to the example solrconfig.xml, and also add a unit test for this feature.
(James Dyer via rmuir)
================== 3.2.0 ================== ================== 3.2.0 ==================
Versions of Major Components Versions of Major Components
--------------------- ---------------------

View File

@ -1084,6 +1084,9 @@
<str name="name">default</str> <str name="name">default</str>
<str name="field">name</str> <str name="field">name</str>
<str name="spellcheckIndexDir">spellchecker</str> <str name="spellcheckIndexDir">spellchecker</str>
<!-- uncomment this to require terms to occur in at least 1% of the documents in order to be included in the dictionary
<float name="thresholdTokenFrequency">.01</float>
-->
</lst> </lst>
<!-- a spellchecker that uses no auxiliary on disk index --> <!-- a spellchecker that uses no auxiliary on disk index -->

View File

@ -59,31 +59,29 @@ import org.slf4j.LoggerFactory;
public class DirectSolrSpellChecker extends SolrSpellChecker { public class DirectSolrSpellChecker extends SolrSpellChecker {
private static final Logger LOG = LoggerFactory.getLogger(DirectSolrSpellChecker.class); private static final Logger LOG = LoggerFactory.getLogger(DirectSolrSpellChecker.class);
/** Field to use as the source of terms */ // configuration params shared with other spellcheckers
public static final String FIELD = "field"; public static final String COMPARATOR_CLASS = AbstractLuceneSpellChecker.COMPARATOR_CLASS;
public static final String SCORE_COMP = AbstractLuceneSpellChecker.SCORE_COMP;
public static final String FREQ_COMP = AbstractLuceneSpellChecker.FREQ_COMP;
public static final String FIELD = AbstractLuceneSpellChecker.FIELD;
public static final String STRING_DISTANCE = AbstractLuceneSpellChecker.STRING_DISTANCE;
public static final String ACCURACY = AbstractLuceneSpellChecker.ACCURACY;
public static final String THRESHOLD_TOKEN_FREQUENCY = IndexBasedSpellChecker.THRESHOLD_TOKEN_FREQUENCY;
public static final String STRING_DISTANCE = "distanceMeasure";
public static final String INTERNAL_DISTANCE = "internal"; public static final String INTERNAL_DISTANCE = "internal";
public static final String ACCURACY = "accuracy";
public static final float DEFAULT_ACCURACY = 0.5f; public static final float DEFAULT_ACCURACY = 0.5f;
public static final float DEFAULT_THRESHOLD_TOKEN_FREQUENCY = 0.0f;
public static final String MAXEDITS = "maxEdits"; public static final String MAXEDITS = "maxEdits";
public static final int DEFAULT_MAXEDITS = 2; public static final int DEFAULT_MAXEDITS = 2;
// params specific to this implementation
public static final String MINPREFIX = "minPrefix"; public static final String MINPREFIX = "minPrefix";
public static final int DEFAULT_MINPREFIX = 1; public static final int DEFAULT_MINPREFIX = 1;
public static final String MAXINSPECTIONS = "maxInspections"; public static final String MAXINSPECTIONS = "maxInspections";
public static final int DEFAULT_MAXINSPECTIONS = 5; public static final int DEFAULT_MAXINSPECTIONS = 5;
public static final String COMPARATOR_CLASS = "comparatorClass";
public static final String SCORE_COMP = "score";
public static final String FREQ_COMP = "freq";
public static final String THRESHOLD = "thresholdTokenFrequency";
public static final float DEFAULT_THRESHOLD = 0.0f;
public static final String MINQUERYLENGTH = "minQueryLength"; public static final String MINQUERYLENGTH = "minQueryLength";
public static final int DEFAULT_MINQUERYLENGTH = 4; public static final int DEFAULT_MINQUERYLENGTH = 4;
@ -117,39 +115,39 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
field = (String) config.get(FIELD); field = (String) config.get(FIELD);
float minAccuracy = DEFAULT_ACCURACY; float minAccuracy = DEFAULT_ACCURACY;
String accuracy = (String) config.get(ACCURACY); Float accuracy = (Float) config.get(ACCURACY);
if (accuracy != null) if (accuracy != null)
minAccuracy = Float.parseFloat(accuracy); minAccuracy = accuracy;
int maxEdits = DEFAULT_MAXEDITS; int maxEdits = DEFAULT_MAXEDITS;
String edits = (String) config.get(MAXEDITS); Integer edits = (Integer) config.get(MAXEDITS);
if (edits != null) if (edits != null)
maxEdits = Integer.parseInt(edits); maxEdits = edits;
int minPrefix = DEFAULT_MINPREFIX; int minPrefix = DEFAULT_MINPREFIX;
String prefix = (String) config.get(MINPREFIX); Integer prefix = (Integer) config.get(MINPREFIX);
if (prefix != null) if (prefix != null)
minPrefix = Integer.parseInt(prefix); minPrefix = prefix;
int maxInspections = DEFAULT_MAXINSPECTIONS; int maxInspections = DEFAULT_MAXINSPECTIONS;
String inspections = (String) config.get(MAXINSPECTIONS); Integer inspections = (Integer) config.get(MAXINSPECTIONS);
if (inspections != null) if (inspections != null)
maxInspections = Integer.parseInt(inspections); maxInspections = inspections;
float minThreshold = DEFAULT_THRESHOLD; float minThreshold = DEFAULT_THRESHOLD_TOKEN_FREQUENCY;
String threshold = (String) config.get(THRESHOLD); Float threshold = (Float) config.get(THRESHOLD_TOKEN_FREQUENCY);
if (threshold != null) if (threshold != null)
minThreshold = Float.parseFloat(threshold); minThreshold = threshold;
int minQueryLength = DEFAULT_MINQUERYLENGTH; int minQueryLength = DEFAULT_MINQUERYLENGTH;
String queryLength = (String) config.get(MINQUERYLENGTH); Integer queryLength = (Integer) config.get(MINQUERYLENGTH);
if (queryLength != null) if (queryLength != null)
minQueryLength = Integer.parseInt(queryLength); minQueryLength = queryLength;
float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY; float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY;
String queryFreq = (String) config.get(MAXQUERYFREQUENCY); Float queryFreq = (Float) config.get(MAXQUERYFREQUENCY);
if (queryFreq != null) if (queryFreq != null)
maxQueryFrequency = Float.parseFloat(queryFreq); maxQueryFrequency = queryFreq;
checker.setComparator(comp); checker.setComparator(comp);
checker.setDistance(sd); checker.setDistance(sd);

View File

@ -49,6 +49,21 @@ Config for testing spellcheck component
<str name="spellcheckIndexDir">spellchecker1</str> <str name="spellcheckIndexDir">spellchecker1</str>
<str name="buildOnCommit">true</str> <str name="buildOnCommit">true</str>
</lst> </lst>
<!-- Spellchecker used by the thresholdTokenFrequency test: built from the
     "lowerfilt" field with thresholdTokenFrequency set to .29.
     No classname is given, so the component's default implementation applies
     (presumably the index-based checker - confirm). -->
<lst name="spellchecker">
<str name="name">threshold</str>
<str name="field">lowerfilt</str>
<str name="spellcheckIndexDir">spellcheckerThreshold</str>
<str name="buildOnCommit">true</str>
<float name="thresholdTokenFrequency">.29</float>
</lst>
<!-- Same field and thresholdTokenFrequency (.29) as the "threshold" checker,
     but backed by solr.DirectSolrSpellChecker.
     NOTE(review): spellcheckIndexDir and buildOnCommit are presumably ignored
     by DirectSolrSpellChecker, which is described elsewhere as using no
     auxiliary on-disk index - confirm, and drop them if unused. -->
<lst name="spellchecker">
<str name="name">threshold_direct</str>
<str name="classname">solr.DirectSolrSpellChecker</str>
<str name="field">lowerfilt</str>
<str name="spellcheckIndexDir">spellcheckerThreshold</str>
<str name="buildOnCommit">true</str>
<float name="thresholdTokenFrequency">.29</float>
</lst>
<lst name="spellchecker"> <lst name="spellchecker">
<str name="name">multipleFields</str> <str name="name">multipleFields</str>
<str name="field">lowerfilt1and2</str> <str name="field">lowerfilt1and2</str>

View File

@ -21,10 +21,15 @@ import java.io.File;
import java.util.*; import java.util.*;
import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SpellingParams; import org.apache.solr.common.params.SpellingParams;
import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrCore;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.spelling.AbstractLuceneSpellChecker; import org.apache.solr.spelling.AbstractLuceneSpellChecker;
import org.junit.BeforeClass; import org.junit.BeforeClass;
@ -188,4 +193,60 @@ public class SpellCheckComponentTest extends SolrTestCaseJ4 {
assertQ(req, "//arr[@name='suggestion'][.='lucenejava']"); assertQ(req, "//arr[@name='suggestion'][.='lucenejava']");
} }
/**
 * Verifies that the thresholdTokenFrequency setting filters rare terms out of
 * the spelling dictionary, for both the "threshold" and the "threshold_direct"
 * (DirectSolrSpellChecker) dictionaries.
 *
 * "document" is in 2 documents but "another" is only in 1. Both dictionaries
 * are configured with thresholdTokenFrequency=.29 in the test solrconfig, so
 * "document" is expected to be suggested while "another" is expected to be
 * absent from the dictionary.
 */
@Test
public void testThresholdTokenFrequency() throws Exception {
  // A frequent term must still be offered as a correction.
  assertJQ(req("qt", rh, SpellCheckComponent.COMPONENT_NAME, "true", "q", "documenq",
               SpellCheckComponent.SPELLCHECK_DICT, "threshold",
               SpellCheckComponent.SPELLCHECK_COUNT, "5",
               SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true")
      , "/spellcheck/suggestions/[1]/suggestion==[{'word':'document','freq':2}]"
  );
  //TODO: DirectSolrSpellChecker returns a different format. Is this OK? Does SOLRJ need tweaking to handle this???
  // The expected value previously ended with an unbalanced extra ']'.
  assertJQ(req("qt", rh, SpellCheckComponent.COMPONENT_NAME, "true", "q", "documenq",
               SpellCheckComponent.SPELLCHECK_DICT, "threshold_direct",
               SpellCheckComponent.SPELLCHECK_COUNT, "5",
               SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true")
      , "/spellcheck/suggestions/[1]/suggestion==['document']"
  );

  //TODO: how do we make this into a 1-liner using "assertQ()" ???
  SolrCore core = h.getCore();
  SearchComponent speller = core.getSearchComponent("spellcheck");
  assertNotNull("speller is null and it shouldn't be", speller);

  ModifiableSolrParams params = new ModifiableSolrParams();
  params.add(SpellCheckComponent.COMPONENT_NAME, "true");
  params.add(SpellCheckComponent.SPELLCHECK_COUNT, "10");
  params.add(SpellCheckComponent.SPELLCHECK_DICT, "threshold");
  params.add(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true");
  params.add(CommonParams.Q, "anotheq");
  SolrRequestHandler handler = core.getRequestHandler("spellCheckCompRH");

  // A rare term must not be suggested by the "threshold" dictionary.
  NamedList<?> suggestions = fetchSpellcheckSuggestions(core, handler, params);
  assertNull(suggestions.get("suggestion"));
  assertFalse((Boolean) suggestions.get("correctlySpelled"));

  // Same query against the DirectSolrSpellChecker-backed dictionary.
  params.set(SpellCheckComponent.SPELLCHECK_DICT, "threshold_direct");
  suggestions = fetchSpellcheckSuggestions(core, handler, params);
  assertNull(suggestions.get("suggestion"));
  //TODO: Why is DirectSolrSpellChecker returning "true" here? Is that OK?
  //assertFalse((Boolean) suggestions.get("correctlySpelled"));
}

/**
 * Runs one local request through {@code handler} and returns the
 * "spellcheck"/"suggestions" section of the response.
 *
 * The request is closed in a finally block so it is released even when the
 * handler throws.
 */
private static NamedList<?> fetchSpellcheckSuggestions(SolrCore core,
                                                       SolrRequestHandler handler,
                                                       ModifiableSolrParams params) throws Exception {
  SolrQueryResponse rsp = new SolrQueryResponse();
  rsp.add("responseHeader", new SimpleOrderedMap<Object>());
  SolrQueryRequest request = new LocalSolrQueryRequest(core, params);
  try {
    handler.handleRequest(request, rsp);
  } finally {
    request.close();
  }
  NamedList<?> values = rsp.getValues();
  NamedList<?> spellCheck = (NamedList<?>) values.get("spellcheck");
  return (NamedList<?>) spellCheck.get("suggestions");
}
} }

View File

@ -55,7 +55,7 @@ public class DirectSolrSpellCheckerTest extends SolrTestCaseJ4 {
NamedList spellchecker = new NamedList(); NamedList spellchecker = new NamedList();
spellchecker.add("classname", DirectSolrSpellChecker.class.getName()); spellchecker.add("classname", DirectSolrSpellChecker.class.getName());
spellchecker.add(DirectSolrSpellChecker.FIELD, "teststop"); spellchecker.add(DirectSolrSpellChecker.FIELD, "teststop");
spellchecker.add(DirectSolrSpellChecker.MINQUERYLENGTH, "2"); // we will try "fob" spellchecker.add(DirectSolrSpellChecker.MINQUERYLENGTH, 2); // we will try "fob"
SolrCore core = h.getCore(); SolrCore core = h.getCore();
checker.init(spellchecker, core); checker.init(spellchecker, core);