SOLR-395: spell checker upgrade

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@592129 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mike Klaas 2007-11-05 19:39:14 +00:00
parent 009a33ce22
commit 4b3ae817b7
6 changed files with 1028 additions and 16 deletions

View File

@ -136,13 +136,19 @@ New Features
to the detailed field information from the solrj client API.
(Grant Ingersoll via ehatcher)
26. SOLR-334L Pluggable query parsers. Allows specification of query
26. SOLR-334: Pluggable query parsers. Allows specification of query
type and arguments as a prefix on a query string. (yonik)
27. SOLR-351L External Value Source. An external file may be used
27. SOLR-351: External Value Source. An external file may be used
to specify the values of a field, currently usable as
a ValueSource in a FunctionQuery. (yonik)
28. SOLR-395: Many new features for the spell checker implementation, including
an extended response mode with much richer output, multi-word spell checking,
and a bevy of new and renamed options (see the wiki).
(Mike Krimerman, Scott Taber via klaas).
Changes in runtime behavior
Optimizations

View File

@ -18,6 +18,7 @@
package org.apache.solr.handler;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.LuceneDictionary;
@ -30,7 +31,9 @@ import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.util.HiFrequencyDictionary;
import java.io.File;
import java.io.IOException;
@ -42,6 +45,141 @@ import java.util.logging.Logger;
* Takes a string (e.g. a query string) as the value of the "q" parameter
* and looks up alternative spelling suggestions in the spellchecker.
* The spellchecker used by this handler is the Lucene contrib SpellChecker.
*
<style>
pre.code
{
border: 1pt solid #AEBDCC;
background-color: #F3F5F7;
padding: 5pt;
font-family: courier, monospace;
white-space: pre;
// begin css 3 or browser specific rules - do not remove!
//see: http://forums.techguy.org/archive/index.php/t-249849.html
white-space: pre-wrap;
word-wrap: break-word;
white-space: -moz-pre-wrap;
white-space: -pre-wrap;
white-space: -o-pre-wrap;
// end css 3 or browser specific rules
}
</style>
*
 * <p>The results identify the original word by echoing it as an entry with the
 * name "words" and the original word as its value. They
 * also indicate whether the requested word is contained in the index through
 * the use of the "exist" true/false name/value pair. Examples of these output
 * parameters in the standard output format are as follows:</p>
* <pre class="code">
&lt;str name="words"&gt;facial&lt;/str&gt;
&lt;str name="exist"&gt;true&lt;/str&gt; </pre>
*
 * <p>If a query string parameter of "multiWords" is used, then each word within the
 * "q" parameter (separated by a space or +) will
* be iterated through the spell checker and will be wrapped in an
* NamedList. Each word will then get its own set of results: words, exists, and
* suggestions.</p>
*
 * <p>Examples of the use of the standard output (XML) without and with the
 * use of the "multiWords" parameter are as follows.</p>
*
* <p> The following URL
* examples were configured with the solr.SpellCheckerRequestHandler
* named as "/spellchecker".</p>
*
* <p>Without the use of "extendedResults" and one word
* spelled correctly: facial </p>
* <pre class="code">http://.../spellchecker?indent=on&onlyMorePopular=true&accuracy=.6&suggestionCount=20&q=facial</pre>
* <pre class="code">
&lt;?xml version="1.0" encoding="UTF-8"?&gt;
&lt;response&gt;
&lt;lst name="responseHeader"&gt;
&lt;int name="status"&gt;0&lt;/int&gt;
&lt;int name="QTime"&gt;6&lt;/int&gt;
&lt;/lst&gt;
&lt;str name="words"&gt;facial&lt;/str&gt;
&lt;str name="exist"&gt;true&lt;/str&gt;
&lt;arr name="suggestions"&gt;
&lt;str&gt;faciale&lt;/str&gt;
&lt;str&gt;faucial&lt;/str&gt;
&lt;str&gt;fascial&lt;/str&gt;
&lt;str&gt;facing&lt;/str&gt;
&lt;str&gt;faciei&lt;/str&gt;
&lt;str&gt;facialis&lt;/str&gt;
&lt;str&gt;social&lt;/str&gt;
&lt;str&gt;facile&lt;/str&gt;
&lt;str&gt;spacial&lt;/str&gt;
&lt;str&gt;glacial&lt;/str&gt;
&lt;str&gt;marcial&lt;/str&gt;
&lt;str&gt;facies&lt;/str&gt;
&lt;str&gt;facio&lt;/str&gt;
&lt;/arr&gt;
&lt;/response&gt; </pre>
*
* <p>Without the use of "extendedResults" and two words,
* one spelled correctly and one misspelled: facial salophosphoprotein </p>
* <pre class="code">http://.../spellchecker?indent=on&onlyMorePopular=true&accuracy=.6&suggestionCount=20&q=facial+salophosphoprotein</pre>
* <pre class="code">
&lt;?xml version="1.0" encoding="UTF-8"?&gt;
&lt;response&gt;
&lt;lst name="responseHeader"&gt;
&lt;int name="status"&gt;0&lt;/int&gt;
&lt;int name="QTime"&gt;18&lt;/int&gt;
&lt;/lst&gt;
&lt;str name="words"&gt;facial salophosphoprotein&lt;/str&gt;
&lt;str name="exist"&gt;false&lt;/str&gt;
&lt;arr name="suggestions"&gt;
&lt;str&gt;sialophosphoprotein&lt;/str&gt;
&lt;/arr&gt;
&lt;/response&gt; </pre>
*
*
* <p>With the use of "extendedResults" and two words,
* one spelled correctly and one misspelled: facial salophosphoprotein </p>
* <pre class="code">http://.../spellchecker?indent=on&onlyMorePopular=true&accuracy=.6&suggestionCount=20&extendedResults=true&q=facial+salophosphoprotein</pre>
* <pre class="code">
&lt;?xml version="1.0" encoding="UTF-8"?&gt;
&lt;response&gt;
&lt;lst name="responseHeader"&gt;
&lt;int name="status"&gt;0&lt;/int&gt;
&lt;int name="QTime"&gt;23&lt;/int&gt;
&lt;/lst&gt;
&lt;lst name="result"&gt;
&lt;lst name="facial"&gt;
&lt;int name="frequency"&gt;1&lt;/int&gt;
&lt;lst name="suggestions"&gt;
&lt;lst name="faciale"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="faucial"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="fascial"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="facing"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="faciei"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="facialis"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="social"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="facile"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="spacial"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="glacial"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="marcial"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="facies"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="facio"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;/lst&gt;
&lt;/lst&gt;
&lt;lst name="salophosphoprotein"&gt;
&lt;int name="frequency"&gt;0&lt;/int&gt;
&lt;lst name="suggestions"&gt;
&lt;lst name="sialophosphoprotein"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="phosphoprotein"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="phosphoproteins"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;lst name="alphalipoprotein"&gt;&lt;int name="frequency"&gt;1&lt;/int&gt;&lt;/lst&gt;
&lt;/lst&gt;
&lt;/lst&gt;
&lt;/lst&gt;
&lt;/response&gt; </pre>
*
* @see <a href="http://wiki.apache.org/jakarta-lucene/SpellChecker">The Lucene Spellchecker documentation</a>
*
*/
@ -64,22 +202,37 @@ public class SpellCheckerRequestHandler extends RequestHandlerBase {
* return only the words more frequent than this.
*
*/
private boolean onlyMorePopular = false;
private Directory spellcheckerIndexDir = new RAMDirectory();
private String dirDescription = "(ramdir)";
private String termSourceField;
private static final String PREFIX = "sp.";
private static final String QUERY_PREFIX = PREFIX + "query.";
private static final String DICTIONARY_PREFIX = PREFIX + "dictionary.";
private static final String SOURCE_FIELD = DICTIONARY_PREFIX + "termSourceField";
private static final String INDEX_DIR = DICTIONARY_PREFIX + "indexDir";
private static final String THRESHOLD = DICTIONARY_PREFIX + "threshold";
private static final String ACCURACY = QUERY_PREFIX + "accuracy";
private static final String SUGGESTIONS = QUERY_PREFIX + "suggestionCount";
private static final String POPULAR = QUERY_PREFIX + "onlyMorePopular";
private static final String EXTENDED = QUERY_PREFIX + "extendedResults";
private static final float DEFAULT_ACCURACY = 0.5f;
private static final int DEFAULT_NUM_SUGGESTIONS = 1;
private static final int DEFAULT_SUGGESTION_COUNT = 1;
private static final boolean DEFAULT_MORE_POPULAR = false;
private static final boolean DEFAULT_EXTENDED_RESULTS = false;
private static final float DEFAULT_DICTIONARY_THRESHOLD = 0.0f;
public void init(NamedList args) {
super.init(args);
SolrParams p = SolrParams.toSolrParams(args);
termSourceField = p.get("termSourceField");
termSourceField = p.get(SOURCE_FIELD, p.get("termSourceField"));
try {
String dir = p.get("spellcheckerIndexDir");
String dir = p.get(INDEX_DIR, p.get("spellcheckerIndexDir"));
if (null != dir) {
File f = new File(dir);
if ( ! f.isAbsolute() ) {
@ -97,6 +250,10 @@ public class SpellCheckerRequestHandler extends RequestHandlerBase {
}
}
/**
* Processes the following query string parameters: q, multiWords, cmd rebuild,
* cmd reopen, accuracy, suggestionCount, restrictToField, and onlyMorePopular.
*/
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
throws Exception {
SolrParams p = req.getParams();
@ -115,36 +272,78 @@ public class SpellCheckerRequestHandler extends RequestHandlerBase {
}
}
// empty query string
if (null == words || "".equals(words.trim())) {
return;
}
IndexReader indexReader = null;
String suggestionField = null;
Float accuracy;
int numSug;
boolean onlyMorePopular;
boolean extendedResults;
try {
accuracy = p.getFloat("accuracy", DEFAULT_ACCURACY);
accuracy = p.getFloat(ACCURACY, p.getFloat("accuracy", DEFAULT_ACCURACY));
spellChecker.setAccuracy(accuracy);
} catch (NumberFormatException e) {
throw new RuntimeException("Accuracy must be a valid positive float", e);
}
try {
numSug = p.getInt("suggestionCount", DEFAULT_NUM_SUGGESTIONS);
numSug = p.getInt(SUGGESTIONS, p.getInt("suggestionCount", DEFAULT_SUGGESTION_COUNT));
} catch (NumberFormatException e) {
throw new RuntimeException("Spelling suggestion count must be a valid positive integer", e);
}
try {
onlyMorePopular = p.getBool("onlyMorePopular", DEFAULT_MORE_POPULAR);
} catch (NumberFormatException e) {
onlyMorePopular = p.getBool(POPULAR, DEFAULT_MORE_POPULAR);
} catch (SolrException e) {
throw new RuntimeException("'Only more popular' must be a valid boolean", e);
}
try {
extendedResults = p.getBool(EXTENDED, DEFAULT_EXTENDED_RESULTS);
} catch (SolrException e) {
throw new RuntimeException("'Extended results' must be a valid boolean", e);
}
// when searching for more popular, a non null index-reader and
// when searching for more popular, a non null index-reader and
// restricted-field are required
if (onlyMorePopular) {
if (onlyMorePopular || extendedResults) {
indexReader = req.getSearcher().getReader();
suggestionField = termSourceField;
}
if (extendedResults) {
if (null != words && !"".equals(words.trim())) {
SimpleOrderedMap<Object> results = new SimpleOrderedMap<Object>();
String[] wordz = words.split(" ");
for (String word : wordz)
{
SimpleOrderedMap<Object> nl = new SimpleOrderedMap<Object>();
nl.add("frequency", indexReader.docFreq(new Term(suggestionField, word)));
String[] suggestions =
spellChecker.suggestSimilar(word, numSug,
indexReader, suggestionField, onlyMorePopular);
// suggestion array
NamedList<Object> sa = new NamedList<Object>();
for (int i=0; i<suggestions.length; i++) {
// suggestion item
SimpleOrderedMap<Object> si = new SimpleOrderedMap<Object>();
si.add("frequency", indexReader.docFreq(new Term(termSourceField, suggestions[i])));
sa.add(suggestions[i], si);
}
nl.add("suggestions", sa);
results.add(word, nl);
}
rsp.add( "result", results );
} else {
rsp.add("words", words);
if (spellChecker.exist(words)) {
rsp.add("exist","true");
} else {
rsp.add("exist","false");
}
String[] suggestions =
spellChecker.suggestSimilar(words, numSug,
indexReader, suggestionField,
@ -156,6 +355,7 @@ public class SpellCheckerRequestHandler extends RequestHandlerBase {
/** Rebuilds the SpellChecker index using values from the <code>termSourceField</code> from the
* index pointed to by the current {@link IndexSearcher}.
 * Any word appearing in fewer than thresh documents will not be added to the spellcheck index.
*/
private void rebuild(SolrQueryRequest req) throws IOException, SolrException {
if (null == termSourceField) {
@ -163,8 +363,15 @@ public class SpellCheckerRequestHandler extends RequestHandlerBase {
(SolrException.ErrorCode.SERVER_ERROR, "can't rebuild spellchecker index without termSourceField configured");
}
Float threshold;
try {
threshold = req.getParams().getFloat("sp.dictionary.threshold", DEFAULT_DICTIONARY_THRESHOLD);
} catch (NumberFormatException e) {
throw new RuntimeException("Threshold must be a valid positive float", e);
}
IndexReader indexReader = req.getSearcher().getReader();
Dictionary dictionary = new LuceneDictionary(indexReader, termSourceField);
Dictionary dictionary = new HiFrequencyDictionary(indexReader, termSourceField, threshold);
spellChecker.clearIndex();
spellChecker.indexDictionary(dictionary);
reopen();

View File

@ -0,0 +1,140 @@
package org.apache.solr.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.spell.Dictionary;
/**
* Hi Frequency Dictionary: terms taken from the given field
* of a Lucene index, which appear in a number of documents
* above a given threshold.
*
 * When using IndexReader.terms(Term) the code must not call next() on the
 * TermEnum before reading the first term, see: http://issues.apache.org/jira/browse/LUCENE-6
*
* Threshold is a value in [0..1] representing the minimum
* number of documents (of the total) where a term should appear.
*
* @author Mike Krimerman
*
* Based on LuceneDictionary, by
* @author Nicolas Maisonneuve
* @author Christian Mallwitz
*/
public class HiFrequencyDictionary implements Dictionary {
  // Source of terms and of the document frequencies used for thresholding.
  private IndexReader reader;
  // Field whose terms feed the dictionary; interned in the constructor so
  // iteration can compare field names by identity (see hasNext()).
  private String field;
  // Fraction in [0..1] of reader.numDocs() a term must appear in to be kept.
  private float thresh;

  /**
   * Creates a dictionary over the terms of {@code field} in {@code reader},
   * keeping only terms whose document frequency is at least
   * {@code thresh * reader.numDocs()}.
   *
   * @param reader index to read terms and frequencies from
   * @param field  source field for dictionary terms
   * @param thresh minimum document-frequency fraction in [0..1]
   */
  public HiFrequencyDictionary(IndexReader reader, String field, float thresh) {
    this.reader = reader;
    this.field = field.intern();
    this.thresh = thresh;
  }

  /** Returns a fresh iterator over the words that pass the threshold. */
  public final Iterator getWordsIterator() {
    return new HiFrequencyIterator();
  }

  /**
   * Iterator over the qualifying terms of {@code field}.
   * Uses the "read term(), then next()" order required when the TermEnum
   * is obtained via IndexReader.terms(Term) (LUCENE-6 workaround noted in
   * the class Javadoc).
   */
  final class HiFrequencyIterator implements Iterator {
    private TermEnum termEnum;
    // Term positioned under the enum the last time hasNext() ran; null when
    // iteration is exhausted or we left the requested field.
    private Term actualTerm;
    // True while actualTerm reflects the current enum position, so repeated
    // hasNext() calls don't advance the enum.
    private boolean hasNextCalled;
    // Absolute document count derived from thresh; computed once up front.
    private int minNumDocs;

    HiFrequencyIterator() {
      try {
        // Seek directly to the first term of the requested field.
        termEnum = reader.terms(new Term(field, ""));
        minNumDocs = (int)(thresh * (float)reader.numDocs());
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    /** True when the term's document frequency meets the threshold. */
    private boolean isFrequent(Term term) {
      try {
        return reader.docFreq(term) >= minNumDocs;
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    /**
     * Returns the current word's text (or null when exhausted) and advances
     * the enum past it.
     */
    public Object next() {
      if (!hasNextCalled) {
        hasNext();
      }
      hasNextCalled = false;

      try {
        // Advance only after actualTerm was captured by hasNext().
        termEnum.next();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }

      return (actualTerm != null) ? actualTerm.text() : null;
    }

    /** Scans forward to the next term of {@code field} passing the threshold. */
    public boolean hasNext() {
      if (hasNextCalled) {
        return actualTerm != null;
      }
      hasNextCalled = true;

      do {
        actualTerm = termEnum.term();

        // if there are no words return false
        if (actualTerm == null) {
          return false;
        }

        String currentField = actualTerm.field();

        // if the next word doesn't have the same field return false
        // (identity compare; field is interned and Term.field() is assumed
        // to return interned strings per Lucene convention — TODO confirm)
        if (currentField != field) {
          actualTerm = null;
          return false;
        }

        // got a valid term, does it pass the threshold?
        if (isFrequent(actualTerm)) {
          return true;
        }

        // term not up to threshold
        try {
          termEnum.next();
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      } while (true);
    }

    /** Removal is not supported; the underlying index is read-only here. */
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }
}

View File

@ -0,0 +1,473 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler;
import org.apache.solr.util.AbstractSolrTestCase;
/**
* This is a test case to test the SpellCheckerRequestHandler class.
* It tests:
* <ul>
* <li>The generation of the spell checkers list with a 10 words</li>
* <li>The identification of the word that was being spell checked</li>
* <li>The confirmation if the word exists or not in the index</li>
* <li>The suggested list of a correctly and incorrectly spelled words</li>
* <li>The suggestions for both correct and incorrect words</li>
* <li>The limitation on the number of suggestions with the
* suggestionCount parameter</li>
* <li>The usage of the parameter multiWords</li>
* </ul>
*
* Notes/Concerns about this Test Case:
* <ul>
* <li>This is my first test case for a Solr Handler. As such I am not
* familiar with the AbstractSolrTestCase and as such I am not
* 100% these test cases will work under the same for each person
* who runs the test cases (see next note).</li>
 * <li>The order of the arrays (arr) may not be consistent on other
 * systems or different runs, as such these test cases may fail?</li>
* <li>Note: I changed //arr/str[1][.='cart'] to //arr/str[.='cart'] and it
* appears to work.</li>
* <li>The two notations appear to successfully test for the same thing:
* "//lst[@name='result']/lst[1][@name='word']/str[@name='words'][.='cat']"
* and "//str[@name='words'][.='cat']" which I would think // would indicate
* a root node.</li>
* </ul>
*/
public class SpellCheckerRequestHandlerTest
    extends AbstractSolrTestCase
{

  // Use the spell-checker specific schema and solrconfig fixtures.
  @Override
  public String getSchemaFile() { return "solr/conf/schema-spellchecker.xml"; }
  @Override
  public String getSolrConfigFile() { return "solr/conf/solrconfig-spellchecker.xml"; }

  @Override
  public void setUp() throws Exception {
    super.setUp();
  }

  /**
   * Indexes ten documents (one word each in the "spell" field), commits,
   * optimizes, and issues a cmd=rebuild request so the spell-check index
   * is built from those terms. Clears lrf.args before returning so each
   * caller starts from a clean parameter map.
   */
  private void buildSpellCheckIndex()
  {
    lrf = h.getRequestFactory("spellchecker", 0, 20 );
    lrf.args.put("version","2.0");
    lrf.args.put("sp.query.accuracy",".9");

    assertU("Add some words to the Spell Check Index:",
            adoc("id", "100",
                 "spell", "solr"));
    assertU(adoc("id", "101",
                 "spell", "cat"));
    assertU(adoc("id", "102",
                 "spell", "cart"));
    assertU(adoc("id", "103",
                 "spell", "carp"));
    assertU(adoc("id", "104",
                 "spell", "cant"));
    assertU(adoc("id", "105",
                 "spell", "catnip"));
    assertU(adoc("id", "106",
                 "spell", "cattails"));
    assertU(adoc("id", "107",
                 "spell", "cod"));
    assertU(adoc("id", "108",
                 "spell", "corn"));
    assertU(adoc("id", "109",
                 "spell", "cot"));

    assertU(commit());
    assertU(optimize());

    // Rebuild the spell-check dictionary from the freshly committed index.
    lrf.args.put("cmd","rebuild");
    assertQ("Need to first build the index:",
            req("cat")
            ,"//str[@name='cmdExecuted'][.='rebuild']"
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            // ,"//arr[@name='suggestions'][.='']"
            );

    lrf.args.clear();
  }

  /**
   * Test for correct spelling of a single word at various accuracy levels
   * to see how the suggestions vary.
   */
  public void testSpellCheck_01_correctWords() {

    buildSpellCheckIndex();

    lrf = h.getRequestFactory("spellchecker", 0, 20 );
    lrf.args.put("version","2.0");
    lrf.args.put("sp.query.accuracy",".9");

    // High accuracy: the word itself matches, no near-misses expected.
    assertQ("Failed to spell check",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            );

    // Lower accuracy pulls in close variants as suggestions.
    lrf.args.put("sp.query.accuracy",".4");
    assertQ("Failed to spell check",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            ,"//arr/str[.='cot']"
            ,"//arr/str[.='cart']"
            // ,"//arr/str[1][.='cot']"
            // ,"//arr/str[2][.='cart']"
            );

    // Zero accuracy admits even loosely related terms.
    lrf.args.put("sp.query.accuracy",".0");
    assertQ("Failed to spell check",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            ,"//arr/str[.='cart']"
            ,"//arr/str[.='cot']"
            ,"//arr/str[.='carp']"
            ,"//arr/str[.='cod']"
            ,"//arr/str[.='corn']"
            );
  }

  /**
   * Test for correct spelling of a single word at various accuracy levels
   * to see how the suggestions vary.
   */
  public void testSpellCheck_02_incorrectWords() {

    buildSpellCheckIndex();

    lrf = h.getRequestFactory("spellchecker", 0, 20 );
    lrf.args.put("version","2.0");
    lrf.args.put("sp.query.accuracy",".9");
    assertQ("Confirm the index is still valid",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            );

    // Misspelled word at high accuracy: exists=false, no suggestions.
    assertQ("Failed to spell check",
            req("coat")
            ,"//str[@name='words'][.='coat']"
            ,"//str[@name='exist'][.='false']"
            ,"//arr[@name='suggestions'][.='']"
            );

    // Low accuracy: many suggestions for the misspelled word.
    lrf.args.put("sp.query.accuracy",".2");
    assertQ("Failed to spell check",
            req("coat")
            ,"//str[@name='words'][.='coat']"
            ,"//str[@name='exist'][.='false']"
            ,"//arr/str[.='cot']"
            ,"//arr/str[.='cat']"
            ,"//arr/str[.='corn']"
            ,"//arr/str[.='cart']"
            ,"//arr/str[.='cod']"
            ,"//arr/str[.='solr']"
            ,"//arr/str[.='carp']"
            );

    // suggestionCount caps the number of suggestions returned.
    lrf.args.put("sp.query.suggestionCount", "2");
    lrf.args.put("sp.query.accuracy",".2");
    assertQ("Failed to spell check",
            req("coat")
            ,"//str[@name='words'][.='coat']"
            ,"//str[@name='exist'][.='false']"
            ,"//arr/str[.='cot']"
            ,"//arr/str[.='cat']"
            );
  }

  /**
   * Test for correct spelling of a single word at various accuracy levels
   * to see how the suggestions vary.
   */
  public void testSpellCheck_03_multiWords_correctWords() {

    buildSpellCheckIndex();

    lrf = h.getRequestFactory("spellchecker", 0, 20 );
    lrf.args.put("version","2.0");
    lrf.args.put("sp.query.accuracy",".9");
    assertQ("Confirm the index is still valid",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            );

    // Enable multiWords formatting:
    lrf.args.put("sp.query.extendedResults", "true");

    // Extended results: per-word lst with a frequency and (here empty)
    // suggestions sub-list.
    assertQ("Failed to spell check",
            req("cat")
            ,"//lst[@name='cat']"
            ,"//lst[@name='cat']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions' and count(lst)=0]"
            );

    // Please note that the following produces the following XML structure.
    // <response>
    //   <responseHeader>
    //     <status>0</status><QTime>0</QTime>
    //   </responseHeader>
    //   <lst name="result">
    //     <lst name="cat">
    //       <int name="frequency">1</int>
    //       <lst name="suggestions">
    //         <lst name="cart"><int name="frequency">1</int></lst>
    //         <lst name="cot"><int name="frequency">1</int></lst>
    //         <lst name="cod"><int name="frequency">1</int></lst>
    //         <lst name="carp"><int name="frequency">1</int></lst>
    //       </lst>
    //     </lst>
    //   </lst>
    // </response>
    lrf.args.put("sp.query.accuracy",".2");
    assertQ("Failed to spell check",
            req("cat")
            ,"//lst[@name='cat']"
            ,"//lst[@name='cat']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cart']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cot']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cod']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='carp']/int[@name='frequency'][.>0]"
            );

    // suggestionCount also limits suggestions in extended-results mode.
    lrf.args.put("sp.query.suggestionCount", "2");
    lrf.args.put("sp.query.accuracy",".2");
    assertQ("Failed to spell check",
            req("cat")
            ,"//lst[@name='cat']"
            ,"//lst[@name='cat']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cart']"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cot']"
            );

    /* The following is the generated XML response for the next query with three words:
      <response>
        <responseHeader><status>0</status><QTime>0</QTime></responseHeader>
        <lst name="result">
          <lst name="cat">
            <int name="frequency">1</int>
            <lst name="suggestions">
              <lst name="cart"><int name="frequency">1</int></lst>
              <lst name="cot"><int name="frequency">1</int></lst>
            </lst>
          </lst>
          <lst name="card">
            <int name="frequency">1</int>
            <lst name="suggestions">
              <lst name="carp"><int name="frequency">1</int></lst>
              <lst name="cat"><int name="frequency">1</int></lst>
            </lst>
          </lst>
          <lst name="carp">
            <int name="frequency">1</int>
            <lst name="suggestions">
              <lst name="cart"><int name="frequency">1</int></lst>
              <lst name="corn"><int name="frequency">1</int></lst>
            </lst>
          </lst>
        </lst>
      </response>
    */
    // Multi-word query: each word gets its own result lst.
    lrf.args.put("sp.query.suggestionCount", "2");
    lrf.args.put("sp.query.accuracy",".2");
    assertQ("Failed to spell check",
            req("cat cart carp")
            ,"//lst[@name='cat']"
            ,"//lst[@name='cat']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cart']"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cot']"
            ,"//lst[@name='cart']"
            ,"//lst[@name='cart']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cart']/lst/lst[1]"
            ,"//lst[@name='cart']/lst/lst[2]"
            ,"//lst[@name='carp']"
            ,"//lst[@name='carp']/int[@name='frequency'][.>0]"
            ,"//lst[@name='carp']/lst[@name='suggestions']/lst[@name='cart']"
            ,"//lst[@name='carp']/lst[@name='suggestions']/lst[@name='corn']"
            );
  }

  /**
   * Test for correct spelling of a single word at various accuracy levels
   * to see how the suggestions vary.
   */
  public void testSpellCheck_04_multiWords_incorrectWords() {

    buildSpellCheckIndex();

    lrf = h.getRequestFactory("spellchecker", 0, 20 );
    lrf.args.put("version","2.0");
    lrf.args.put("sp.query.accuracy",".9");
    assertQ("Confirm the index is still valid",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            );

    // Enable multiWords formatting:
    lrf.args.put("sp.query.extendedResults", "true");

    // Misspelled word: frequency 0 and no suggestions at high accuracy.
    assertQ("Failed to spell check",
            req("coat")
            ,"//lst[@name='coat']"
            ,"//lst[@name='coat']/int[@name='frequency'][.=0]"
            ,"//lst[@name='coat']/lst[@name='suggestions' and count(lst)=0]"
            );

    lrf.args.put("sp.query.accuracy",".2");
    assertQ("Failed to spell check",
            req("coat")
            ,"//lst[@name='coat']"
            ,"//lst[@name='coat']/int[@name='frequency'][.=0]"
            ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cot']"
            ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cat']"
            ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='corn']"
            ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cart']"
            );

    lrf.args.put("sp.query.suggestionCount", "2");
    lrf.args.put("sp.query.accuracy",".2");
    assertQ("Failed to spell check",
            req("coat")
            ,"//lst[@name='coat']"
            ,"//lst[@name='coat']/int[@name='frequency'][.=0]"
            ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cot']"
            ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cat']"
            );

    // Three misspelled words, two suggestions each (positional checks only).
    lrf.args.put("sp.query.suggestionCount", "2");
    lrf.args.put("sp.query.accuracy",".2");
    assertQ("Failed to spell check",
            req("cet cert corp")
            ,"//lst[@name='cet']"
            ,"//lst[@name='cet']/int[@name='frequency'][.=0]"
            ,"//lst[@name='cet']/lst[@name='suggestions']/lst[1]"
            ,"//lst[@name='cet']/lst[@name='suggestions']/lst[2]"
            ,"//lst[@name='cert']"
            ,"//lst[@name='cert']/int[@name='frequency'][.=0]"
            ,"//lst[@name='cert']/lst[@name='suggestions']/lst[1]"
            ,"//lst[@name='cert']/lst[@name='suggestions']/lst[2]"
            ,"//lst[@name='corp']"
            ,"//lst[@name='corp']/int[@name='frequency'][.=0]"
            ,"//lst[@name='corp']/lst[@name='suggestions']/lst[1]"
            ,"//lst[@name='corp']/lst[@name='suggestions']/lst[2]"
            );
  }

  /**
   * Verifies that sp.dictionary.threshold excludes rare terms from the
   * rebuilt dictionary: with threshold 0.20, terms appearing in few
   * documents ("cod", "corn") are dropped while common ones remain.
   */
  public void testSpellCheck_05_buildDictionary() {

    lrf = h.getRequestFactory("spellchecker", 0, 20 );
    lrf.args.put("version","2.0");
    lrf.args.put("sp.query.accuracy",".9");

    assertU("Add some words to the Spell Check Index:",
            adoc("id", "100",
                 "spell", "solr cat cart"));
    assertU(adoc("id", "101",
                 "spell", "cat cart"));
    assertU(adoc("id", "102",
                 "spell", "cat cart"));
    assertU(adoc("id", "103",
                 "spell", "cat cart carp"));
    assertU(adoc("id", "104",
                 "spell", "cat car cant"));
    assertU(adoc("id", "105",
                 "spell", "cat catnip"));
    assertU(adoc("id", "106",
                 "spell", "cat cattails"));
    assertU(adoc("id", "107",
                 "spell", "cat cod"));
    assertU(adoc("id", "108",
                 "spell", "cat corn"));
    assertU(adoc("id", "109",
                 "spell", "cat cot"));

    assertU(commit());
    assertU(optimize());

    // Rebuild with a document-frequency threshold of 20%.
    lrf.args.put("sp.dictionary.threshold", "0.20");
    lrf.args.put("cmd","rebuild");
    assertQ("Need to first build the index:",
            req("cat")
            ,"//str[@name='cmdExecuted'][.='rebuild']"
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            );

    lrf.args.clear();
    lrf.args.put("version","2.0");
    lrf.args.put("sp.query.accuracy",".9");

    assertQ("Confirm index contains only words above threshold",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            );

    assertQ("Confirm index contains only words above threshold",
            req("cart")
            ,"//str[@name='words'][.='cart']"
            ,"//str[@name='exist'][.='true']"
            );

    assertQ("Confirm index contains only words above threshold",
            req("cod")
            ,"//str[@name='words'][.='cod']"
            ,"//str[@name='exist'][.='false']"
            );

    assertQ("Confirm index contains only words above threshold",
            req("corn")
            ,"//str[@name='words'][.='corn']"
            ,"//str[@name='exist'][.='false']"
            );

    lrf.args.clear();
  }

}

View File

@ -0,0 +1,83 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- This is the Solr schema file. This file should be named "schema.xml" and
should be in the conf directory under the solr home
(i.e. ./solr/conf/schema.xml by default)
or located where the classloader for the Solr webapp can find it.
For more information, on how to customize this file, please see
http://wiki.apache.org/solr/SchemaXml
-->
<schema name="Solr SpellCheck Test" version="1.1">
  <!-- attribute "name" is the name of this schema and is only used for display purposes.
       Applications should change this to reflect the nature of the search collection.
       version="1.1" is Solr's version number for the schema syntax and semantics.  It should
       not normally be changed by applications.
       1.0: multiValued attribute did not exist, all fields are multiValued by nature
       1.1: multiValued attribute introduced, false by default -->
  <types>
    <fieldtype name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
    <!-- General-purpose text analysis for the default search field. -->
    <fieldtype name="text" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.StopFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory"/>
      </analyzer>
    </fieldtype>
    <!-- Analysis for the spell-check source field: no stemming or
         lower-casing, so dictionary terms match the indexed words. -->
    <fieldType name="spellText" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>
  </types>
  <fields>
    <field name="id" type="string" indexed="true" stored="true"/>
    <!-- Source field the spell checker builds its dictionary from. -->
    <field name="spell" type="spellText" indexed="true" stored="true" />
    <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
  </fields>
  <!-- field to use to determine and enforce document uniqueness. -->
  <uniqueKey>id</uniqueKey>
  <!-- field for the QueryParser to use when an explicit fieldname is absent -->
  <defaultSearchField>text</defaultSearchField>
  <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
  <solrQueryParser defaultOperator="OR"/>
</schema>
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Minimal Solr test configuration exercising the spell-checker
request handler. -->
<config>
<!-- Defaults applied to all index writers. -->
<indexDefaults>
<useCompoundFile>false</useCompoundFile>
<mergeFactor>10</mergeFactor>
<maxBufferedDocs>1000</maxBufferedDocs>
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>
<writeLockTimeout>1000</writeLockTimeout>
<commitLockTimeout>10000</commitLockTimeout>
</indexDefaults>
<!-- Settings specific to the main index; overrides indexDefaults. -->
<mainIndex>
<useCompoundFile>false</useCompoundFile>
<mergeFactor>10</mergeFactor>
<maxBufferedDocs>1000</maxBufferedDocs>
<maxMergeDocs>2147483647</maxMergeDocs>
<maxFieldLength>10000</maxFieldLength>
<!-- Tests may be killed mid-run; clear stale locks on startup. -->
<unlockOnStartup>true</unlockOnStartup>
</mainIndex>
<updateHandler class="solr.DirectUpdateHandler2">
<commitIntervalLowerBound>0</commitIntervalLowerBound>
</updateHandler>
<!-- Query-time caches and optimizer knobs. -->
<query>
<maxBooleanClauses>1024</maxBooleanClauses>
<useFilterForSortedQuery>true</useFilterForSortedQuery>
<queryResultWindowSize>10</queryResultWindowSize>
<HashDocSet maxSize="3000" loadFactor="0.75"/>
<boolTofilterOptimizer enabled="true" cacheSize="32" threshold=".05"/>
</query>
<requestHandler name="standard" class="solr.StandardRequestHandler" />
<requestHandler name="/update" class="solr.XmlUpdateRequestHandler" />
<!-- SpellCheckerRequestHandler takes in a word (or several words) as the
value of the "q" parameter and returns a list of alternative spelling
suggestions. If invoked with a ...&cmd=rebuild, it will rebuild the
spellchecker index.
-->
<requestHandler name="spellchecker" class="solr.SpellCheckerRequestHandler" startup="lazy">
<!-- default values for query parameters -->
<lst name="defaults">
<int name="sp.query.suggestionCount">20</int>
<float name="sp.query.accuracy">0.60</float>
</lst>
<!-- Main init params for handler -->
<!-- The directory where your SpellChecker Index should live. -->
<!-- May be absolute, or relative to the Solr "dataDir" directory. -->
<!-- If this option is not specified, a RAM directory will be used -->
<str name="sp.dictionary.spellcheckerIndexDir">spell</str>
<!-- the field in your schema that you want to be able to build -->
<!-- your spell index on. This should be a field that uses a very -->
<!-- simple FieldType without a lot of Analysis (ie: string) -->
<str name="sp.dictionary.termSourceField">spell</str>
<!-- threshold for word to make it into the dictionary -->
<!-- a word should appear at minimum in the specified percent of documents -->
<str name="sp.dictionary.threshold">0.0</str>
</requestHandler>
<!-- Response writers referenced by the output-format tests. -->
<queryResponseWriter name="standard" class="org.apache.solr.request.XMLResponseWriter"/>
<queryResponseWriter name="useless" class="org.apache.solr.OutputWriterTest$UselessOutputWriter"/>
<queryResponseWriter name="xslt" class="org.apache.solr.request.XSLTResponseWriter"/>
<queryResponseWriter name="json" class="org.apache.solr.request.JSONResponseWriter"/>
<!-- config for the admin interface -->
<admin>
<defaultQuery>solr</defaultQuery>
<gettableFiles>solrconfig.xml schema.xml admin-extra.html</gettableFiles>
</admin>
</config>