mirror of https://github.com/apache/lucene.git
SOLR-2571: Fix DirectSolrSpellchecker's numeric params to use numeric types, add example and test for thresholdTokenFrequency
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1132855 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d280169fef
commit
6b6e5e6553
|
@ -94,9 +94,9 @@ New Features
|
|||
* SOLR-792: Adding PivotFacetComponent for Hierarchical faceting
|
||||
(erik, Jeremy Hinegardner, Thibaut Lassalle, ryan)
|
||||
|
||||
* LUCENE-2507: Added DirectSolrSpellChecker, which uses Lucene's DirectSpellChecker
|
||||
* LUCENE-2507, SOLR-2571: Added DirectSolrSpellChecker, which uses Lucene's DirectSpellChecker
|
||||
to retrieve correction candidates directly from the term dictionary using
|
||||
levenshtein automata. (rmuir)
|
||||
levenshtein automata. (James Dyer, rmuir)
|
||||
|
||||
* SOLR-1873: SolrCloud - added shared/central config and core/shard managment via zookeeper,
|
||||
built-in load balancing, and infrastructure for future SolrCloud work. (yonik, Mark Miller)
|
||||
|
@ -282,6 +282,13 @@ Bug Fixes
|
|||
parameter is added to avoid excessive CPU time in extreme cases (e.g. long
|
||||
queries with many misspelled words). (James Dyer via rmuir)
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
|
||||
* SOLR-2571: Add a commented out example of the spellchecker's thresholdTokenFrequency
|
||||
parameter to the example solrconfig.xml, and also add a unit test for this feature.
|
||||
(James Dyer via rmuir)
|
||||
|
||||
================== 3.2.0 ==================
|
||||
Versions of Major Components
|
||||
---------------------
|
||||
|
|
|
@ -1084,6 +1084,9 @@
|
|||
<str name="name">default</str>
|
||||
<str name="field">name</str>
|
||||
<str name="spellcheckIndexDir">spellchecker</str>
|
||||
<!-- uncomment this to require terms to occur in 1% of the documents in order to be included in the dictionary
|
||||
<float name="thresholdTokenFrequency">.01</float>
|
||||
-->
|
||||
</lst>
|
||||
|
||||
<!-- a spellchecker that uses no auxiliary on disk index -->
|
||||
|
|
|
@ -59,31 +59,29 @@ import org.slf4j.LoggerFactory;
|
|||
public class DirectSolrSpellChecker extends SolrSpellChecker {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(DirectSolrSpellChecker.class);
|
||||
|
||||
/** Field to use as the source of terms */
|
||||
public static final String FIELD = "field";
|
||||
// configuration params shared with other spellcheckers
|
||||
public static final String COMPARATOR_CLASS = AbstractLuceneSpellChecker.COMPARATOR_CLASS;
|
||||
public static final String SCORE_COMP = AbstractLuceneSpellChecker.SCORE_COMP;
|
||||
public static final String FREQ_COMP = AbstractLuceneSpellChecker.FREQ_COMP;
|
||||
public static final String FIELD = AbstractLuceneSpellChecker.FIELD;
|
||||
public static final String STRING_DISTANCE = AbstractLuceneSpellChecker.STRING_DISTANCE;
|
||||
public static final String ACCURACY = AbstractLuceneSpellChecker.ACCURACY;
|
||||
public static final String THRESHOLD_TOKEN_FREQUENCY = IndexBasedSpellChecker.THRESHOLD_TOKEN_FREQUENCY;
|
||||
|
||||
public static final String STRING_DISTANCE = "distanceMeasure";
|
||||
public static final String INTERNAL_DISTANCE = "internal";
|
||||
|
||||
public static final String ACCURACY = "accuracy";
|
||||
public static final float DEFAULT_ACCURACY = 0.5f;
|
||||
public static final float DEFAULT_THRESHOLD_TOKEN_FREQUENCY = 0.0f;
|
||||
|
||||
public static final String MAXEDITS = "maxEdits";
|
||||
public static final int DEFAULT_MAXEDITS = 2;
|
||||
|
||||
// params specific to this implementation
|
||||
public static final String MINPREFIX = "minPrefix";
|
||||
public static final int DEFAULT_MINPREFIX = 1;
|
||||
|
||||
public static final String MAXINSPECTIONS = "maxInspections";
|
||||
public static final int DEFAULT_MAXINSPECTIONS = 5;
|
||||
|
||||
public static final String COMPARATOR_CLASS = "comparatorClass";
|
||||
public static final String SCORE_COMP = "score";
|
||||
public static final String FREQ_COMP = "freq";
|
||||
|
||||
public static final String THRESHOLD = "thresholdTokenFrequency";
|
||||
public static final float DEFAULT_THRESHOLD = 0.0f;
|
||||
|
||||
public static final String MINQUERYLENGTH = "minQueryLength";
|
||||
public static final int DEFAULT_MINQUERYLENGTH = 4;
|
||||
|
||||
|
@ -117,39 +115,39 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
|
|||
field = (String) config.get(FIELD);
|
||||
|
||||
float minAccuracy = DEFAULT_ACCURACY;
|
||||
String accuracy = (String) config.get(ACCURACY);
|
||||
Float accuracy = (Float) config.get(ACCURACY);
|
||||
if (accuracy != null)
|
||||
minAccuracy = Float.parseFloat(accuracy);
|
||||
minAccuracy = accuracy;
|
||||
|
||||
int maxEdits = DEFAULT_MAXEDITS;
|
||||
String edits = (String) config.get(MAXEDITS);
|
||||
Integer edits = (Integer) config.get(MAXEDITS);
|
||||
if (edits != null)
|
||||
maxEdits = Integer.parseInt(edits);
|
||||
maxEdits = edits;
|
||||
|
||||
int minPrefix = DEFAULT_MINPREFIX;
|
||||
String prefix = (String) config.get(MINPREFIX);
|
||||
Integer prefix = (Integer) config.get(MINPREFIX);
|
||||
if (prefix != null)
|
||||
minPrefix = Integer.parseInt(prefix);
|
||||
minPrefix = prefix;
|
||||
|
||||
int maxInspections = DEFAULT_MAXINSPECTIONS;
|
||||
String inspections = (String) config.get(MAXINSPECTIONS);
|
||||
Integer inspections = (Integer) config.get(MAXINSPECTIONS);
|
||||
if (inspections != null)
|
||||
maxInspections = Integer.parseInt(inspections);
|
||||
maxInspections = inspections;
|
||||
|
||||
float minThreshold = DEFAULT_THRESHOLD;
|
||||
String threshold = (String) config.get(THRESHOLD);
|
||||
float minThreshold = DEFAULT_THRESHOLD_TOKEN_FREQUENCY;
|
||||
Float threshold = (Float) config.get(THRESHOLD_TOKEN_FREQUENCY);
|
||||
if (threshold != null)
|
||||
minThreshold = Float.parseFloat(threshold);
|
||||
minThreshold = threshold;
|
||||
|
||||
int minQueryLength = DEFAULT_MINQUERYLENGTH;
|
||||
String queryLength = (String) config.get(MINQUERYLENGTH);
|
||||
Integer queryLength = (Integer) config.get(MINQUERYLENGTH);
|
||||
if (queryLength != null)
|
||||
minQueryLength = Integer.parseInt(queryLength);
|
||||
minQueryLength = queryLength;
|
||||
|
||||
float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY;
|
||||
String queryFreq = (String) config.get(MAXQUERYFREQUENCY);
|
||||
Float queryFreq = (Float) config.get(MAXQUERYFREQUENCY);
|
||||
if (queryFreq != null)
|
||||
maxQueryFrequency = Float.parseFloat(queryFreq);
|
||||
maxQueryFrequency = queryFreq;
|
||||
|
||||
checker.setComparator(comp);
|
||||
checker.setDistance(sd);
|
||||
|
|
|
@ -49,6 +49,21 @@ Config for testing spellcheck component
|
|||
<str name="spellcheckIndexDir">spellchecker1</str>
|
||||
<str name="buildOnCommit">true</str>
|
||||
</lst>
|
||||
<lst name="spellchecker">
|
||||
<str name="name">threshold</str>
|
||||
<str name="field">lowerfilt</str>
|
||||
<str name="spellcheckIndexDir">spellcheckerThreshold</str>
|
||||
<str name="buildOnCommit">true</str>
|
||||
<float name="thresholdTokenFrequency">.29</float>
|
||||
</lst>
|
||||
<lst name="spellchecker">
|
||||
<str name="name">threshold_direct</str>
|
||||
<str name="classname">solr.DirectSolrSpellChecker</str>
|
||||
<str name="field">lowerfilt</str>
|
||||
<str name="spellcheckIndexDir">spellcheckerThreshold</str>
|
||||
<str name="buildOnCommit">true</str>
|
||||
<float name="thresholdTokenFrequency">.29</float>
|
||||
</lst>
|
||||
<lst name="spellchecker">
|
||||
<str name="name">multipleFields</str>
|
||||
<str name="field">lowerfilt1and2</str>
|
||||
|
|
|
@ -21,10 +21,15 @@ import java.io.File;
|
|||
import java.util.*;
|
||||
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.apache.solr.common.params.SpellingParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.util.SimpleOrderedMap;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.request.LocalSolrQueryRequest;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.request.SolrRequestHandler;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.solr.spelling.AbstractLuceneSpellChecker;
|
||||
import org.junit.BeforeClass;
|
||||
|
@ -188,4 +193,60 @@ public class SpellCheckComponentTest extends SolrTestCaseJ4 {
|
|||
|
||||
assertQ(req, "//arr[@name='suggestion'][.='lucenejava']");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testThresholdTokenFrequency() throws Exception {
|
||||
|
||||
//"document" is in 2 documents but "another" is only in 1.
|
||||
//So with a threshold of 15%, "another" is absent from the dictionary
|
||||
//while "document" is present.
|
||||
|
||||
assertJQ(req("qt",rh, SpellCheckComponent.COMPONENT_NAME, "true", "q","documenq", SpellCheckComponent.SPELLCHECK_DICT, "threshold", SpellCheckComponent.SPELLCHECK_COUNT,"5", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS,"true")
|
||||
,"/spellcheck/suggestions/[1]/suggestion==[{'word':'document','freq':2}]"
|
||||
);
|
||||
|
||||
//TODO: DirectSolrSpellChecker returns a different format. Is this OK? Does SOLRJ need tweaking to handle this???
|
||||
assertJQ(req("qt",rh, SpellCheckComponent.COMPONENT_NAME, "true", "q","documenq", SpellCheckComponent.SPELLCHECK_DICT, "threshold_direct", SpellCheckComponent.SPELLCHECK_COUNT,"5", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS,"true")
|
||||
,"/spellcheck/suggestions/[1]/suggestion==['document']]"
|
||||
);
|
||||
|
||||
//TODO: how do we make this into a 1-liner using "assertQ()" ???
|
||||
SolrCore core = h.getCore();
|
||||
SearchComponent speller = core.getSearchComponent("spellcheck");
|
||||
assertTrue("speller is null and it shouldn't be", speller != null);
|
||||
|
||||
ModifiableSolrParams params = new ModifiableSolrParams();
|
||||
params.add(SpellCheckComponent.COMPONENT_NAME, "true");
|
||||
params.add(SpellCheckComponent.SPELLCHECK_COUNT, "10");
|
||||
params.add(SpellCheckComponent.SPELLCHECK_DICT, "threshold");
|
||||
params.add(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS,"true");
|
||||
params.add(CommonParams.Q, "anotheq");
|
||||
|
||||
SolrRequestHandler handler = core.getRequestHandler("spellCheckCompRH");
|
||||
SolrQueryResponse rsp = new SolrQueryResponse();
|
||||
rsp.add("responseHeader", new SimpleOrderedMap());
|
||||
SolrQueryRequest req = new LocalSolrQueryRequest(core, params);
|
||||
handler.handleRequest(req, rsp);
|
||||
req.close();
|
||||
NamedList values = rsp.getValues();
|
||||
NamedList spellCheck = (NamedList) values.get("spellcheck");
|
||||
NamedList suggestions = (NamedList) spellCheck.get("suggestions");
|
||||
assertTrue(suggestions.get("suggestion")==null);
|
||||
assertTrue((Boolean) suggestions.get("correctlySpelled")==false);
|
||||
|
||||
params.remove(SpellCheckComponent.SPELLCHECK_DICT);
|
||||
params.add(SpellCheckComponent.SPELLCHECK_DICT, "threshold_direct");
|
||||
rsp = new SolrQueryResponse();
|
||||
rsp.add("responseHeader", new SimpleOrderedMap());
|
||||
req = new LocalSolrQueryRequest(core, params);
|
||||
handler.handleRequest(req, rsp);
|
||||
req.close();
|
||||
values = rsp.getValues();
|
||||
spellCheck = (NamedList) values.get("spellcheck");
|
||||
suggestions = (NamedList) spellCheck.get("suggestions");
|
||||
assertTrue(suggestions.get("suggestion")==null);
|
||||
|
||||
//TODO: Why is DirectSolrSpellChecker returning "true" here? Is that OK?
|
||||
//assertTrue((Boolean) suggestions.get("correctlySpelled")==false);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -55,7 +55,7 @@ public class DirectSolrSpellCheckerTest extends SolrTestCaseJ4 {
|
|||
NamedList spellchecker = new NamedList();
|
||||
spellchecker.add("classname", DirectSolrSpellChecker.class.getName());
|
||||
spellchecker.add(DirectSolrSpellChecker.FIELD, "teststop");
|
||||
spellchecker.add(DirectSolrSpellChecker.MINQUERYLENGTH, "2"); // we will try "fob"
|
||||
spellchecker.add(DirectSolrSpellChecker.MINQUERYLENGTH, 2); // we will try "fob"
|
||||
|
||||
SolrCore core = h.getCore();
|
||||
checker.init(spellchecker, core);
|
||||
|
|
Loading…
Reference in New Issue