From 6b6e5e65539e476933de3a131ae5160130cda67f Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 7 Jun 2011 02:38:42 +0000 Subject: [PATCH] SOLR-2571: Fix DirectSolrSpellchecker's numeric params to use numeric types, add example and test for thresholdTokenFrequency git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1132855 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 11 +++- solr/example/solr/conf/solrconfig.xml | 3 + .../solr/spelling/DirectSolrSpellChecker.java | 54 ++++++++-------- .../conf/solrconfig-spellcheckcomponent.xml | 15 +++++ .../component/SpellCheckComponentTest.java | 61 +++++++++++++++++++ .../spelling/DirectSolrSpellCheckerTest.java | 2 +- 6 files changed, 115 insertions(+), 31 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 97a6b5a1183..8833db90edd 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -94,9 +94,9 @@ New Features * SOLR-792: Adding PivotFacetComponent for Hierarchical faceting (erik, Jeremy Hinegardner, Thibaut Lassalle, ryan) -* LUCENE-2507: Added DirectSolrSpellChecker, which uses Lucene's DirectSpellChecker +* LUCENE-2507, SOLR-2571: Added DirectSolrSpellChecker, which uses Lucene's DirectSpellChecker to retrieve correction candidates directly from the term dictionary using - levenshtein automata. (rmuir) + levenshtein automata. (James Dyer, rmuir) * SOLR-1873: SolrCloud - added shared/central config and core/shard managment via zookeeper, built-in load balancing, and infrastructure for future SolrCloud work. (yonik, Mark Miller) @@ -282,6 +282,13 @@ Bug Fixes parameter is added to avoid excessive CPU time in extreme cases (e.g. long queries with many misspelled words). (James Dyer via rmuir) +Other Changes +---------------------- + +* SOLR-2571: Add a commented out example of the spellchecker's thresholdTokenFrequency + parameter to the example solrconfig.xml, and also add a unit test for this feature. + (James Dyer via rmuir) + ================== 3.2.0 ================== Versions of Major Components --------------------- diff --git a/solr/example/solr/conf/solrconfig.xml b/solr/example/solr/conf/solrconfig.xml index 4bbc9aa1b14..b1d30b4af86 100755 --- a/solr/example/solr/conf/solrconfig.xml +++ b/solr/example/solr/conf/solrconfig.xml @@ -1084,6 +1084,9 @@ default name spellchecker + diff --git a/solr/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java b/solr/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java index f3b9d52bad8..38cc71e6ba1 100644 --- a/solr/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java +++ b/solr/src/java/org/apache/solr/spelling/DirectSolrSpellChecker.java @@ -59,31 +59,29 @@ import org.slf4j.LoggerFactory; public class DirectSolrSpellChecker extends SolrSpellChecker { private static final Logger LOG = LoggerFactory.getLogger(DirectSolrSpellChecker.class); - /** Field to use as the source of terms */ - public static final String FIELD = "field"; + // configuration params shared with other spellcheckers + public static final String COMPARATOR_CLASS = AbstractLuceneSpellChecker.COMPARATOR_CLASS; + public static final String SCORE_COMP = AbstractLuceneSpellChecker.SCORE_COMP; + public static final String FREQ_COMP = AbstractLuceneSpellChecker.FREQ_COMP; + public static final String FIELD = AbstractLuceneSpellChecker.FIELD; + public static final String STRING_DISTANCE = AbstractLuceneSpellChecker.STRING_DISTANCE; + public static final String ACCURACY = AbstractLuceneSpellChecker.ACCURACY; + public static final String THRESHOLD_TOKEN_FREQUENCY = IndexBasedSpellChecker.THRESHOLD_TOKEN_FREQUENCY; - public static final String STRING_DISTANCE = "distanceMeasure"; public static final String INTERNAL_DISTANCE = "internal"; - - public static final String ACCURACY = "accuracy"; public static final float DEFAULT_ACCURACY = 0.5f; - + public static final float DEFAULT_THRESHOLD_TOKEN_FREQUENCY = 0.0f; + public static final String MAXEDITS = "maxEdits"; public static final int DEFAULT_MAXEDITS = 2; + // params specific to this implementation public static final String MINPREFIX = "minPrefix"; public static final int DEFAULT_MINPREFIX = 1; public static final String MAXINSPECTIONS = "maxInspections"; public static final int DEFAULT_MAXINSPECTIONS = 5; - public static final String COMPARATOR_CLASS = "comparatorClass"; - public static final String SCORE_COMP = "score"; - public static final String FREQ_COMP = "freq"; - - public static final String THRESHOLD = "thresholdTokenFrequency"; - public static final float DEFAULT_THRESHOLD = 0.0f; - public static final String MINQUERYLENGTH = "minQueryLength"; public static final int DEFAULT_MINQUERYLENGTH = 4; @@ -117,39 +115,39 @@ public class DirectSolrSpellChecker extends SolrSpellChecker { field = (String) config.get(FIELD); float minAccuracy = DEFAULT_ACCURACY; - String accuracy = (String) config.get(ACCURACY); + Float accuracy = (Float) config.get(ACCURACY); if (accuracy != null) - minAccuracy = Float.parseFloat(accuracy); + minAccuracy = accuracy; int maxEdits = DEFAULT_MAXEDITS; - String edits = (String) config.get(MAXEDITS); + Integer edits = (Integer) config.get(MAXEDITS); if (edits != null) - maxEdits = Integer.parseInt(edits); + maxEdits = edits; int minPrefix = DEFAULT_MINPREFIX; - String prefix = (String) config.get(MINPREFIX); + Integer prefix = (Integer) config.get(MINPREFIX); if (prefix != null) - minPrefix = Integer.parseInt(prefix); + minPrefix = prefix; int maxInspections = DEFAULT_MAXINSPECTIONS; - String inspections = (String) config.get(MAXINSPECTIONS); + Integer inspections = (Integer) config.get(MAXINSPECTIONS); if (inspections != null) - maxInspections = Integer.parseInt(inspections); + maxInspections = inspections; - float minThreshold = DEFAULT_THRESHOLD; - String threshold = (String) config.get(THRESHOLD); + float minThreshold = DEFAULT_THRESHOLD_TOKEN_FREQUENCY; + Float threshold = (Float) config.get(THRESHOLD_TOKEN_FREQUENCY); if (threshold != null) - minThreshold = Float.parseFloat(threshold); + minThreshold = threshold; int minQueryLength = DEFAULT_MINQUERYLENGTH; - String queryLength = (String) config.get(MINQUERYLENGTH); + Integer queryLength = (Integer) config.get(MINQUERYLENGTH); if (queryLength != null) - minQueryLength = Integer.parseInt(queryLength); + minQueryLength = queryLength; float maxQueryFrequency = DEFAULT_MAXQUERYFREQUENCY; - String queryFreq = (String) config.get(MAXQUERYFREQUENCY); + Float queryFreq = (Float) config.get(MAXQUERYFREQUENCY); if (queryFreq != null) - maxQueryFrequency = Float.parseFloat(queryFreq); + maxQueryFrequency = queryFreq; checker.setComparator(comp); checker.setDistance(sd); diff --git a/solr/src/test-files/solr/conf/solrconfig-spellcheckcomponent.xml b/solr/src/test-files/solr/conf/solrconfig-spellcheckcomponent.xml index 7141abf016b..322c938f194 100644 --- a/solr/src/test-files/solr/conf/solrconfig-spellcheckcomponent.xml +++ b/solr/src/test-files/solr/conf/solrconfig-spellcheckcomponent.xml @@ -49,6 +49,21 @@ Config for testing spellcheck component spellchecker1 true + + threshold + lowerfilt + spellcheckerThreshold + true + .29 + + + threshold_direct + solr.DirectSolrSpellChecker + lowerfilt + spellcheckerThreshold + true + .29 + multipleFields lowerfilt1and2 diff --git a/solr/src/test/org/apache/solr/handler/component/SpellCheckComponentTest.java b/solr/src/test/org/apache/solr/handler/component/SpellCheckComponentTest.java index 30972f8b881..a2754f549ee 100644 --- a/solr/src/test/org/apache/solr/handler/component/SpellCheckComponentTest.java +++ b/solr/src/test/org/apache/solr/handler/component/SpellCheckComponentTest.java @@ -21,10 +21,15 @@ import java.io.File; import java.util.*; import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.SpellingParams; import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.SimpleOrderedMap; import org.apache.solr.core.SolrCore; +import org.apache.solr.request.LocalSolrQueryRequest; import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.request.SolrRequestHandler; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.spelling.AbstractLuceneSpellChecker; import org.junit.BeforeClass; @@ -188,4 +193,60 @@ public class SpellCheckComponentTest extends SolrTestCaseJ4 { assertQ(req, "//arr[@name='suggestion'][.='lucenejava']"); } + + @Test + public void testThresholdTokenFrequency() throws Exception { + + //"document" is in 2 documents but "another" is only in 1. + //So with a threshold of 15%, "another" is absent from the dictionary + //while "document" is present. + + assertJQ(req("qt",rh, SpellCheckComponent.COMPONENT_NAME, "true", "q","documenq", SpellCheckComponent.SPELLCHECK_DICT, "threshold", SpellCheckComponent.SPELLCHECK_COUNT,"5", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS,"true") + ,"/spellcheck/suggestions/[1]/suggestion==[{'word':'document','freq':2}]" + ); + + //TODO: DirectSolrSpellChecker returns a different format. Is this OK? Does SOLRJ need tweaking to handle this??? + assertJQ(req("qt",rh, SpellCheckComponent.COMPONENT_NAME, "true", "q","documenq", SpellCheckComponent.SPELLCHECK_DICT, "threshold_direct", SpellCheckComponent.SPELLCHECK_COUNT,"5", SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS,"true") + ,"/spellcheck/suggestions/[1]/suggestion==['document']]" + ); + + //TODO: how do we make this into a 1-liner using "assertQ()" ??? + SolrCore core = h.getCore(); + SearchComponent speller = core.getSearchComponent("spellcheck"); + assertTrue("speller is null and it shouldn't be", speller != null); + + ModifiableSolrParams params = new ModifiableSolrParams(); + params.add(SpellCheckComponent.COMPONENT_NAME, "true"); + params.add(SpellCheckComponent.SPELLCHECK_COUNT, "10"); + params.add(SpellCheckComponent.SPELLCHECK_DICT, "threshold"); + params.add(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS,"true"); + params.add(CommonParams.Q, "anotheq"); + + SolrRequestHandler handler = core.getRequestHandler("spellCheckCompRH"); + SolrQueryResponse rsp = new SolrQueryResponse(); + rsp.add("responseHeader", new SimpleOrderedMap()); + SolrQueryRequest req = new LocalSolrQueryRequest(core, params); + handler.handleRequest(req, rsp); + req.close(); + NamedList values = rsp.getValues(); + NamedList spellCheck = (NamedList) values.get("spellcheck"); + NamedList suggestions = (NamedList) spellCheck.get("suggestions"); + assertTrue(suggestions.get("suggestion")==null); + assertTrue((Boolean) suggestions.get("correctlySpelled")==false); + + params.remove(SpellCheckComponent.SPELLCHECK_DICT); + params.add(SpellCheckComponent.SPELLCHECK_DICT, "threshold_direct"); + rsp = new SolrQueryResponse(); + rsp.add("responseHeader", new SimpleOrderedMap()); + req = new LocalSolrQueryRequest(core, params); + handler.handleRequest(req, rsp); + req.close(); + values = rsp.getValues(); + spellCheck = (NamedList) values.get("spellcheck"); + suggestions = (NamedList) spellCheck.get("suggestions"); + assertTrue(suggestions.get("suggestion")==null); + + //TODO: Why is DirectSolrSpellChecker returning "true" here? Is that OK? + //assertTrue((Boolean) suggestions.get("correctlySpelled")==false); + } } diff --git a/solr/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java b/solr/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java index 7e93afb8358..e96700f7a00 100644 --- a/solr/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java +++ b/solr/src/test/org/apache/solr/spelling/DirectSolrSpellCheckerTest.java @@ -55,7 +55,7 @@ public class DirectSolrSpellCheckerTest extends SolrTestCaseJ4 { NamedList spellchecker = new NamedList(); spellchecker.add("classname", DirectSolrSpellChecker.class.getName()); spellchecker.add(DirectSolrSpellChecker.FIELD, "teststop"); - spellchecker.add(DirectSolrSpellChecker.MINQUERYLENGTH, "2"); // we will try "fob" + spellchecker.add(DirectSolrSpellChecker.MINQUERYLENGTH, 2); // we will try "fob" SolrCore core = h.getCore(); checker.init(spellchecker, core);