mirror of https://github.com/apache/lucene.git

SOLR-395: spell checker upgrade

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@592129 13f79535-47bb-0310-9956-ffa450edef68

parent 009a33ce22
commit 4b3ae817b7

CHANGES.txt | 10
@@ -136,13 +136,19 @@ New Features
     to the detailed field information from the solrj client API.
     (Grant Ingersoll via ehatcher)

-26. SOLR-334L Pluggable query parsers. Allows specification of query
+26. SOLR-334: Pluggable query parsers. Allows specification of query
     type and arguments as a prefix on a query string. (yonik)

-27. SOLR-351L External Value Source. An external file may be used
+27. SOLR-351: External Value Source. An external file may be used
     to specify the values of a field, currently usable as
     a ValueSource in a FunctionQuery. (yonik)

+28. SOLR-395: Many new features for the spell checker implementation, including
+    an extended response mode with much richer output, multi-word spell checking,
+    and a bevy of new and renamed options (see the wiki).
+    (Mike Krimerman, Scott Taber via klaas).
+
 Changes in runtime behavior

 Optimizations
@@ -18,6 +18,7 @@
 package org.apache.solr.handler;

 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.spell.Dictionary;
 import org.apache.lucene.search.spell.LuceneDictionary;

@@ -30,7 +31,9 @@ import org.apache.solr.request.SolrQueryResponse;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.core.SolrCore;
+import org.apache.solr.util.HiFrequencyDictionary;

 import java.io.File;
 import java.io.IOException;

@@ -42,6 +45,141 @@ import java.util.logging.Logger;
 * Takes a string (e.g. a query string) as the value of the "q" parameter
 * and looks up alternative spelling suggestions in the spellchecker.
 * The spellchecker used by this handler is the Lucene contrib SpellChecker.
 *
<style>
 pre.code
 {
  border: 1pt solid #AEBDCC;
  background-color: #F3F5F7;
  padding: 5pt;
  font-family: courier, monospace;
  white-space: pre;
  // begin css 3 or browser specific rules - do not remove!
  // see: http://forums.techguy.org/archive/index.php/t-249849.html
  white-space: pre-wrap;
  word-wrap: break-word;
  white-space: -moz-pre-wrap;
  white-space: -pre-wrap;
  white-space: -o-pre-wrap;
  // end css 3 or browser specific rules
 }
</style>
 *
 * <p>The response identifies the original word by echoing it back as an entry
 * named "words" whose value is the original word. It
 * also indicates whether the requested word is contained in the index through
 * the "exist" true/false value. Examples of these output
 * parameters in the standard output format are as follows:</p>
 * <pre class="code">
   <str name="words">facial</str>
   <str name="exist">true</str> </pre>
 *
 * <p>If the query string parameter "extendedResults" is used, then each word within the
 * "q" parameter (separated by a space or +) will
 * be run through the spell checker and wrapped in a
 * NamedList. Each word then gets its own set of results: its frequency in the
 * index and its suggestions.</p>
 *
 * <p>Examples of the standard output (XML) without and with the
 * use of the "extendedResults" parameter are as follows.</p>
 *
 * <p>The following URL
 * examples were configured with the solr.SpellCheckerRequestHandler
 * named as "/spellchecker".</p>
 *
 * <p>Without the use of "extendedResults" and one word
 * spelled correctly: facial</p>
 * <pre class="code">http://.../spellchecker?indent=on&onlyMorePopular=true&accuracy=.6&suggestionCount=20&q=facial</pre>
 * <pre class="code">
 <?xml version="1.0" encoding="UTF-8"?>
 <response>

 <lst name="responseHeader">
  <int name="status">0</int>
  <int name="QTime">6</int>
 </lst>
 <str name="words">facial</str>
 <str name="exist">true</str>
 <arr name="suggestions">
  <str>faciale</str>
  <str>faucial</str>
  <str>fascial</str>
  <str>facing</str>
  <str>faciei</str>
  <str>facialis</str>
  <str>social</str>
  <str>facile</str>
  <str>spacial</str>
  <str>glacial</str>
  <str>marcial</str>
  <str>facies</str>
  <str>facio</str>
 </arr>
 </response> </pre>
 *
 * <p>Without the use of "extendedResults" and two words,
 * one spelled correctly and one misspelled: facial salophosphoprotein</p>
 * <pre class="code">http://.../spellchecker?indent=on&onlyMorePopular=true&accuracy=.6&suggestionCount=20&q=facial+salophosphoprotein</pre>
 * <pre class="code">
 <?xml version="1.0" encoding="UTF-8"?>
 <response>

 <lst name="responseHeader">
  <int name="status">0</int>
  <int name="QTime">18</int>
 </lst>
 <str name="words">facial salophosphoprotein</str>
 <str name="exist">false</str>
 <arr name="suggestions">
  <str>sialophosphoprotein</str>
 </arr>
 </response> </pre>
 *
 * <p>With the use of "extendedResults" and two words,
 * one spelled correctly and one misspelled: facial salophosphoprotein</p>
 * <pre class="code">http://.../spellchecker?indent=on&onlyMorePopular=true&accuracy=.6&suggestionCount=20&extendedResults=true&q=facial+salophosphoprotein</pre>
 * <pre class="code">
 <?xml version="1.0" encoding="UTF-8"?>
 <response>

 <lst name="responseHeader">
  <int name="status">0</int>
  <int name="QTime">23</int>
 </lst>
 <lst name="result">
  <lst name="facial">
   <int name="frequency">1</int>
   <lst name="suggestions">
    <lst name="faciale"><int name="frequency">1</int></lst>
    <lst name="faucial"><int name="frequency">1</int></lst>
    <lst name="fascial"><int name="frequency">1</int></lst>
    <lst name="facing"><int name="frequency">1</int></lst>
    <lst name="faciei"><int name="frequency">1</int></lst>
    <lst name="facialis"><int name="frequency">1</int></lst>
    <lst name="social"><int name="frequency">1</int></lst>
    <lst name="facile"><int name="frequency">1</int></lst>
    <lst name="spacial"><int name="frequency">1</int></lst>
    <lst name="glacial"><int name="frequency">1</int></lst>
    <lst name="marcial"><int name="frequency">1</int></lst>
    <lst name="facies"><int name="frequency">1</int></lst>
    <lst name="facio"><int name="frequency">1</int></lst>
   </lst>
  </lst>
  <lst name="salophosphoprotein">
   <int name="frequency">0</int>
   <lst name="suggestions">
    <lst name="sialophosphoprotein"><int name="frequency">1</int></lst>
    <lst name="phosphoprotein"><int name="frequency">1</int></lst>
    <lst name="phosphoproteins"><int name="frequency">1</int></lst>
    <lst name="alphalipoprotein"><int name="frequency">1</int></lst>
   </lst>
  </lst>
 </lst>
 </response> </pre>
 *
 * @see <a href="http://wiki.apache.org/jakarta-lucene/SpellChecker">The Lucene Spellchecker documentation</a>
 *
 */
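The class documentation above describes the request parameters and the XML that comes back. As a rough illustration of how a client might call the handler over HTTP, here is a small, self-contained Java sketch using only JDK classes; the host, port and handler path are assumptions for the example (they depend on how and where the handler is deployed), not part of this commit.

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.net.URL;
    import java.net.URLEncoder;

    public class SpellCheckerClientSketch {
      public static void main(String[] args) throws Exception {
        // Assumed local Solr instance with the handler registered as "spellchecker".
        String base = "http://localhost:8983/solr/spellchecker";
        String query = URLEncoder.encode("facial salophosphoprotein", "UTF-8");

        // Parameters as described in the javadoc above; extendedResults switches on
        // the per-word frequency/suggestions output.
        URL url = new URL(base
            + "?q=" + query
            + "&suggestionCount=20&accuracy=.6&onlyMorePopular=true&extendedResults=true");

        // Print the raw XML response, which has the shape shown in the examples above.
        BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), "UTF-8"));
        String line;
        while ((line = in.readLine()) != null) {
          System.out.println(line);
        }
        in.close();
      }
    }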
@@ -64,22 +202,37 @@ public class SpellCheckerRequestHandler extends RequestHandlerBase {
   * return only the words more frequent than this.
   *
   */
  private boolean onlyMorePopular = false;

  private Directory spellcheckerIndexDir = new RAMDirectory();
  private String dirDescription = "(ramdir)";
  private String termSourceField;

  private static final String PREFIX = "sp.";
  private static final String QUERY_PREFIX = PREFIX + "query.";
  private static final String DICTIONARY_PREFIX = PREFIX + "dictionary.";

  private static final String SOURCE_FIELD = DICTIONARY_PREFIX + "termSourceField";
  private static final String INDEX_DIR = DICTIONARY_PREFIX + "indexDir";
  private static final String THRESHOLD = DICTIONARY_PREFIX + "threshold";

  private static final String ACCURACY = QUERY_PREFIX + "accuracy";
  private static final String SUGGESTIONS = QUERY_PREFIX + "suggestionCount";
  private static final String POPULAR = QUERY_PREFIX + "onlyMorePopular";
  private static final String EXTENDED = QUERY_PREFIX + "extendedResults";

  private static final float DEFAULT_ACCURACY = 0.5f;
-  private static final int DEFAULT_NUM_SUGGESTIONS = 1;
+  private static final int DEFAULT_SUGGESTION_COUNT = 1;
  private static final boolean DEFAULT_MORE_POPULAR = false;
  private static final boolean DEFAULT_EXTENDED_RESULTS = false;
  private static final float DEFAULT_DICTIONARY_THRESHOLD = 0.0f;

  public void init(NamedList args) {
    super.init(args);
    SolrParams p = SolrParams.toSolrParams(args);
-    termSourceField = p.get("termSourceField");
+    termSourceField = p.get(SOURCE_FIELD, p.get("termSourceField"));

    try {
-      String dir = p.get("spellcheckerIndexDir");
+      String dir = p.get(INDEX_DIR, p.get("spellcheckerIndexDir"));
      if (null != dir) {
        File f = new File(dir);
        if ( ! f.isAbsolute() ) {
@@ -97,6 +250,10 @@ public class SpellCheckerRequestHandler extends RequestHandlerBase {
    }
  }

  /**
   * Processes the following query string parameters: q, extendedResults,
   * cmd (rebuild or reopen), accuracy, suggestionCount, restrictToField, and onlyMorePopular.
   */
  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
    throws Exception {
    SolrParams p = req.getParams();
@@ -115,36 +272,78 @@ public class SpellCheckerRequestHandler extends RequestHandlerBase {
      }
    }

    // empty query string
    if (null == words || "".equals(words.trim())) {
      return;
    }

    IndexReader indexReader = null;
    String suggestionField = null;
    Float accuracy;
    int numSug;
    boolean onlyMorePopular;
    boolean extendedResults;
    try {
-      accuracy = p.getFloat("accuracy", DEFAULT_ACCURACY);
+      accuracy = p.getFloat(ACCURACY, p.getFloat("accuracy", DEFAULT_ACCURACY));
      spellChecker.setAccuracy(accuracy);
    } catch (NumberFormatException e) {
      throw new RuntimeException("Accuracy must be a valid positive float", e);
    }
    try {
-      numSug = p.getInt("suggestionCount", DEFAULT_NUM_SUGGESTIONS);
+      numSug = p.getInt(SUGGESTIONS, p.getInt("suggestionCount", DEFAULT_SUGGESTION_COUNT));
    } catch (NumberFormatException e) {
      throw new RuntimeException("Spelling suggestion count must be a valid positive integer", e);
    }
    try {
-      onlyMorePopular = p.getBool("onlyMorePopular", DEFAULT_MORE_POPULAR);
-    } catch (NumberFormatException e) {
+      onlyMorePopular = p.getBool(POPULAR, DEFAULT_MORE_POPULAR);
+    } catch (SolrException e) {
      throw new RuntimeException("'Only more popular' must be a valid boolean", e);
    }
    try {
      extendedResults = p.getBool(EXTENDED, DEFAULT_EXTENDED_RESULTS);
    } catch (SolrException e) {
      throw new RuntimeException("'Extended results' must be a valid boolean", e);
    }

    // when searching for more popular, a non-null index reader and
    // restricted field are required
-    if (onlyMorePopular) {
+    if (onlyMorePopular || extendedResults) {
      indexReader = req.getSearcher().getReader();
      suggestionField = termSourceField;
    }

    if (extendedResults) {

      if (null != words && !"".equals(words.trim())) {
        SimpleOrderedMap<Object> results = new SimpleOrderedMap<Object>();
        String[] wordz = words.split(" ");
        for (String word : wordz)
        {
          SimpleOrderedMap<Object> nl = new SimpleOrderedMap<Object>();
          nl.add("frequency", indexReader.docFreq(new Term(suggestionField, word)));
          String[] suggestions =
            spellChecker.suggestSimilar(word, numSug,
                                        indexReader, suggestionField, onlyMorePopular);

          // suggestion array
          NamedList<Object> sa = new NamedList<Object>();
          for (int i=0; i<suggestions.length; i++) {
            // suggestion item
            SimpleOrderedMap<Object> si = new SimpleOrderedMap<Object>();
            si.add("frequency", indexReader.docFreq(new Term(termSourceField, suggestions[i])));
            sa.add(suggestions[i], si);
          }
          nl.add("suggestions", sa);
          results.add(word, nl);
        }
        rsp.add( "result", results );

      } else {
        rsp.add("words", words);
        if (spellChecker.exist(words)) {
          rsp.add("exist","true");
        } else {
          rsp.add("exist","false");
        }
        String[] suggestions =
          spellChecker.suggestSimilar(words, numSug,
                                      indexReader, suggestionField,
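The extendedResults branch above builds one SimpleOrderedMap per input word, holding a "frequency" entry plus a "suggestions" NamedList whose entries map each suggested word to its own frequency. Below is a minimal sketch of how a caller of the response object might walk that structure; the sample data is made up here purely to mirror the shape the handler produces, it is not part of this commit.

    import org.apache.solr.common.util.NamedList;
    import org.apache.solr.common.util.SimpleOrderedMap;

    public class ExtendedResultsWalkSketch {
      public static void main(String[] args) {
        // Build a tiny structure shaped like the handler's "result" output:
        // result -> word -> { frequency, suggestions -> { suggestion -> { frequency } } }
        SimpleOrderedMap<Object> suggestionItem = new SimpleOrderedMap<Object>();
        suggestionItem.add("frequency", 1);
        SimpleOrderedMap<Object> suggestions = new SimpleOrderedMap<Object>();
        suggestions.add("cart", suggestionItem);

        SimpleOrderedMap<Object> wordInfo = new SimpleOrderedMap<Object>();
        wordInfo.add("frequency", 1);
        wordInfo.add("suggestions", suggestions);

        SimpleOrderedMap<Object> result = new SimpleOrderedMap<Object>();
        result.add("cat", wordInfo);

        // Walk it the way a consumer of rsp.add("result", results) might.
        for (int i = 0; i < result.size(); i++) {
          String original = result.getName(i);
          NamedList info = (NamedList) result.getVal(i);
          System.out.println(original + " appears in " + info.get("frequency") + " doc(s)");

          NamedList sugg = (NamedList) info.get("suggestions");
          for (int j = 0; j < sugg.size(); j++) {
            NamedList item = (NamedList) sugg.getVal(j);
            System.out.println("  suggestion: " + sugg.getName(j)
                + " (frequency " + item.get("frequency") + ")");
          }
        }
      }
    }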
@@ -156,6 +355,7 @@ public class SpellCheckerRequestHandler extends RequestHandlerBase {

  /** Rebuilds the SpellChecker index using values from the <code>termSourceField</code> from the
   * index pointed to by the current {@link IndexSearcher}.
   * Any word appearing in fewer than the threshold fraction of documents will not be added to the spellcheck index.
   */
  private void rebuild(SolrQueryRequest req) throws IOException, SolrException {
    if (null == termSourceField) {

@@ -163,8 +363,15 @@ public class SpellCheckerRequestHandler extends RequestHandlerBase {
        (SolrException.ErrorCode.SERVER_ERROR, "can't rebuild spellchecker index without termSourceField configured");
    }

    Float threshold;
    try {
      threshold = req.getParams().getFloat("sp.dictionary.threshold", DEFAULT_DICTIONARY_THRESHOLD);
    } catch (NumberFormatException e) {
      throw new RuntimeException("Threshold must be a valid positive float", e);
    }

    IndexReader indexReader = req.getSearcher().getReader();
-    Dictionary dictionary = new LuceneDictionary(indexReader, termSourceField);
+    Dictionary dictionary = new HiFrequencyDictionary(indexReader, termSourceField, threshold);
    spellChecker.clearIndex();
    spellChecker.indexDictionary(dictionary);
    reopen();
@@ -0,0 +1,140 @@
package org.apache.solr.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.spell.Dictionary;

/**
 * High-frequency dictionary: terms taken from the given field
 * of a Lucene index, which appear in a number of documents
 * above a given threshold.
 *
 * When using IndexReader.terms(Term), the code must not call next() on the
 * TermEnum as its first call; see: http://issues.apache.org/jira/browse/LUCENE-6
 *
 * Threshold is a value in [0..1] representing the minimum
 * fraction of the total documents in which a term must appear.
 *
 * @author Mike Krimerman
 *
 * Based on LuceneDictionary, by
 * @author Nicolas Maisonneuve
 * @author Christian Mallwitz
 */
public class HiFrequencyDictionary implements Dictionary {
  private IndexReader reader;
  private String field;
  private float thresh;

  public HiFrequencyDictionary(IndexReader reader, String field, float thresh) {
    this.reader = reader;
    this.field = field.intern();
    this.thresh = thresh;
  }

  public final Iterator getWordsIterator() {
    return new HiFrequencyIterator();
  }


  final class HiFrequencyIterator implements Iterator {
    private TermEnum termEnum;
    private Term actualTerm;
    private boolean hasNextCalled;
    private int minNumDocs;

    HiFrequencyIterator() {
      try {
        termEnum = reader.terms(new Term(field, ""));
        minNumDocs = (int)(thresh * (float)reader.numDocs());
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    private boolean isFrequent(Term term) {
      try {
        return reader.docFreq(term) >= minNumDocs;
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }

    public Object next() {
      if (!hasNextCalled) {
        hasNext();
      }
      hasNextCalled = false;

      try {
        termEnum.next();
      } catch (IOException e) {
        throw new RuntimeException(e);
      }

      return (actualTerm != null) ? actualTerm.text() : null;
    }

    public boolean hasNext() {
      if (hasNextCalled) {
        return actualTerm != null;
      }
      hasNextCalled = true;

      do {
        actualTerm = termEnum.term();

        // if there are no more words, return false
        if (actualTerm == null) {
          return false;
        }

        String currentField = actualTerm.field();

        // if the next word doesn't have the same field, return false
        if (currentField != field) {
          actualTerm = null;
          return false;
        }

        // got a valid term; does it pass the threshold?
        if (isFrequent(actualTerm)) {
          return true;
        }

        // term not up to threshold
        try {
          termEnum.next();
        } catch (IOException e) {
          throw new RuntimeException(e);
        }

      } while (true);
    }

    public void remove() {
      throw new UnsupportedOperationException();
    }
  }
}
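For context, here is a rough sketch of how this dictionary is meant to be used, mirroring what rebuild() above does: open an IndexReader over an existing index, wrap the source field in a HiFrequencyDictionary with a threshold, and feed it to the Lucene contrib SpellChecker. With threshold 0.2 and a 10-document index (as in the test case below), minNumDocs works out to (int)(0.2 * 10) = 2, so only terms appearing in at least 2 documents make it into the spelling index. The index path and field name in the sketch are assumptions for the example, not part of this commit.

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.spell.Dictionary;
    import org.apache.lucene.search.spell.SpellChecker;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.RAMDirectory;
    import org.apache.solr.util.HiFrequencyDictionary;

    public class HiFrequencyDictionarySketch {
      public static void main(String[] args) throws Exception {
        // Assumed location of an existing Lucene/Solr index and its source field.
        Directory mainIndex = FSDirectory.getDirectory("/path/to/solr/data/index");
        IndexReader reader = IndexReader.open(mainIndex);

        // Keep only terms from the "spell" field that appear in at least 20% of the documents.
        Dictionary dictionary = new HiFrequencyDictionary(reader, "spell", 0.2f);

        // Build the spelling index in RAM, as the handler does by default.
        SpellChecker spellChecker = new SpellChecker(new RAMDirectory());
        spellChecker.clearIndex();
        spellChecker.indexDictionary(dictionary);

        // Ask for up to 5 suggestions for a misspelled word.
        String[] suggestions = spellChecker.suggestSimilar("coat", 5);
        for (String s : suggestions) {
          System.out.println(s);
        }

        reader.close();
      }
    }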
@@ -0,0 +1,473 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.solr.handler;

import org.apache.solr.util.AbstractSolrTestCase;

/**
 * This is a test case for the SpellCheckerRequestHandler class.
 * It tests:
 * <ul>
 *   <li>The generation of the spell checker index with 10 words</li>
 *   <li>The identification of the word that was being spell checked</li>
 *   <li>The confirmation of whether or not the word exists in the index</li>
 *   <li>The suggestion lists for correctly and incorrectly spelled words</li>
 *   <li>The suggestions for both correct and incorrect words</li>
 *   <li>The limitation on the number of suggestions with the
 *       suggestionCount parameter</li>
 *   <li>The usage of the extendedResults parameter</li>
 * </ul>
 *
 * Notes/Concerns about this Test Case:
 * <ul>
 *   <li>This is my first test case for a Solr Handler. As such I am not
 *       familiar with the AbstractSolrTestCase, so I am not 100% sure
 *       these test cases will behave the same for each person
 *       who runs them (see next note).</li>
 *   <li>The order of the arrays (arr) may not be consistent on other
 *       systems or different runs, so these test cases may fail?</li>
 *   <li>Note: I changed //arr/str[1][.='cart'] to //arr/str[.='cart'] and it
 *       appears to work.</li>
 *   <li>The two notations appear to successfully test for the same thing:
 *       "//lst[@name='result']/lst[1][@name='word']/str[@name='words'][.='cat']"
 *       and "//str[@name='words'][.='cat']", which I would think // would indicate
 *       a root node.</li>
 * </ul>
 */
public class SpellCheckerRequestHandlerTest
  extends AbstractSolrTestCase
{

  @Override
  public String getSchemaFile() { return "solr/conf/schema-spellchecker.xml"; }

  @Override
  public String getSolrConfigFile() { return "solr/conf/solrconfig-spellchecker.xml"; }

  @Override
  public void setUp() throws Exception {
    super.setUp();
  }

  private void buildSpellCheckIndex()
  {
    lrf = h.getRequestFactory("spellchecker", 0, 20);
    lrf.args.put("version", "2.0");
    lrf.args.put("sp.query.accuracy", ".9");

    assertU("Add some words to the Spell Check Index:",
            adoc("id", "100", "spell", "solr"));
    assertU(adoc("id", "101", "spell", "cat"));
    assertU(adoc("id", "102", "spell", "cart"));
    assertU(adoc("id", "103", "spell", "carp"));
    assertU(adoc("id", "104", "spell", "cant"));
    assertU(adoc("id", "105", "spell", "catnip"));
    assertU(adoc("id", "106", "spell", "cattails"));
    assertU(adoc("id", "107", "spell", "cod"));
    assertU(adoc("id", "108", "spell", "corn"));
    assertU(adoc("id", "109", "spell", "cot"));

    assertU(commit());
    assertU(optimize());

    lrf.args.put("cmd", "rebuild");
    assertQ("Need to first build the index:",
            req("cat")
            ,"//str[@name='cmdExecuted'][.='rebuild']"
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            // ,"//arr[@name='suggestions'][.='']"
            );
    lrf.args.clear();
  }

  /**
   * Test spell checking of a single correctly spelled word at various
   * accuracy levels to see how the suggestions vary.
   */
  public void testSpellCheck_01_correctWords() {

    buildSpellCheckIndex();

    lrf = h.getRequestFactory("spellchecker", 0, 20);
    lrf.args.put("version", "2.0");

    lrf.args.put("sp.query.accuracy", ".9");
    assertQ("Failed to spell check",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            );

    lrf.args.put("sp.query.accuracy", ".4");
    assertQ("Failed to spell check",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            ,"//arr/str[.='cot']"
            ,"//arr/str[.='cart']"
            // ,"//arr/str[1][.='cot']"
            // ,"//arr/str[2][.='cart']"
            );

    lrf.args.put("sp.query.accuracy", ".0");
    assertQ("Failed to spell check",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            ,"//arr/str[.='cart']"
            ,"//arr/str[.='cot']"
            ,"//arr/str[.='carp']"
            ,"//arr/str[.='cod']"
            ,"//arr/str[.='corn']"
            );
  }

  /**
   * Test spell checking of a single misspelled word at various accuracy
   * levels to see how the suggestions vary.
   */
  public void testSpellCheck_02_incorrectWords() {

    buildSpellCheckIndex();

    lrf = h.getRequestFactory("spellchecker", 0, 20);
    lrf.args.put("version", "2.0");
    lrf.args.put("sp.query.accuracy", ".9");

    assertQ("Confirm the index is still valid",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            );

    assertQ("Failed to spell check",
            req("coat")
            ,"//str[@name='words'][.='coat']"
            ,"//str[@name='exist'][.='false']"
            ,"//arr[@name='suggestions'][.='']"
            );

    lrf.args.put("sp.query.accuracy", ".2");
    assertQ("Failed to spell check",
            req("coat")
            ,"//str[@name='words'][.='coat']"
            ,"//str[@name='exist'][.='false']"
            ,"//arr/str[.='cot']"
            ,"//arr/str[.='cat']"
            ,"//arr/str[.='corn']"
            ,"//arr/str[.='cart']"
            ,"//arr/str[.='cod']"
            ,"//arr/str[.='solr']"
            ,"//arr/str[.='carp']"
            );

    lrf.args.put("sp.query.suggestionCount", "2");
    lrf.args.put("sp.query.accuracy", ".2");
    assertQ("Failed to spell check",
            req("coat")
            ,"//str[@name='words'][.='coat']"
            ,"//str[@name='exist'][.='false']"
            ,"//arr/str[.='cot']"
            ,"//arr/str[.='cat']"
            );
  }

  /**
   * Test extended results for multiple correctly spelled words at various
   * accuracy levels to see how the suggestions vary.
   */
  public void testSpellCheck_03_multiWords_correctWords() {

    buildSpellCheckIndex();

    lrf = h.getRequestFactory("spellchecker", 0, 20);
    lrf.args.put("version", "2.0");
    lrf.args.put("sp.query.accuracy", ".9");

    assertQ("Confirm the index is still valid",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            );

    // Enable multi-word (extended results) formatting:
    lrf.args.put("sp.query.extendedResults", "true");

    assertQ("Failed to spell check",
            req("cat")
            ,"//lst[@name='cat']"
            ,"//lst[@name='cat']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions' and count(lst)=0]"
            );

    // Please note that the following produces this XML structure:
    // <response>
    //   <responseHeader>
    //     <status>0</status><QTime>0</QTime>
    //   </responseHeader>
    //   <lst name="result">
    //     <lst name="cat">
    //       <int name="frequency">1</int>
    //       <lst name="suggestions">
    //         <lst name="cart"><int name="frequency">1</int></lst>
    //         <lst name="cot"><int name="frequency">1</int></lst>
    //         <lst name="cod"><int name="frequency">1</int></lst>
    //         <lst name="carp"><int name="frequency">1</int></lst>
    //       </lst>
    //     </lst>
    //   </lst>
    // </response>

    lrf.args.put("sp.query.accuracy", ".2");
    assertQ("Failed to spell check",
            req("cat")
            ,"//lst[@name='cat']"
            ,"//lst[@name='cat']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cart']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cot']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cod']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='carp']/int[@name='frequency'][.>0]"
            );

    lrf.args.put("sp.query.suggestionCount", "2");
    lrf.args.put("sp.query.accuracy", ".2");
    assertQ("Failed to spell check",
            req("cat")
            ,"//lst[@name='cat']"
            ,"//lst[@name='cat']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cart']"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cot']"
            );

    /* The following is the generated XML response for the next query with three words:
      <response>
        <responseHeader><status>0</status><QTime>0</QTime></responseHeader>
        <lst name="result">
          <lst name="cat">
            <int name="frequency">1</int>
            <lst name="suggestions">
              <lst name="cart"><int name="frequency">1</int></lst>
              <lst name="cot"><int name="frequency">1</int></lst>
            </lst>
          </lst>
          <lst name="card">
            <int name="frequency">1</int>
            <lst name="suggestions">
              <lst name="carp"><int name="frequency">1</int></lst>
              <lst name="cat"><int name="frequency">1</int></lst>
            </lst>
          </lst>
          <lst name="carp">
            <int name="frequency">1</int>
            <lst name="suggestions">
              <lst name="cart"><int name="frequency">1</int></lst>
              <lst name="corn"><int name="frequency">1</int></lst>
            </lst>
          </lst>
        </lst>
      </response>
    */

    lrf.args.put("sp.query.suggestionCount", "2");
    lrf.args.put("sp.query.accuracy", ".2");
    assertQ("Failed to spell check",
            req("cat cart carp")
            ,"//lst[@name='cat']"
            ,"//lst[@name='cat']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cart']"
            ,"//lst[@name='cat']/lst[@name='suggestions']/lst[@name='cot']"

            ,"//lst[@name='cart']"
            ,"//lst[@name='cart']/int[@name='frequency'][.>0]"
            ,"//lst[@name='cart']/lst/lst[1]"
            ,"//lst[@name='cart']/lst/lst[2]"

            ,"//lst[@name='carp']"
            ,"//lst[@name='carp']/int[@name='frequency'][.>0]"
            ,"//lst[@name='carp']/lst[@name='suggestions']/lst[@name='cart']"
            ,"//lst[@name='carp']/lst[@name='suggestions']/lst[@name='corn']"
            );
  }

  /**
   * Test extended results for multiple misspelled words at various
   * accuracy levels to see how the suggestions vary.
   */
  public void testSpellCheck_04_multiWords_incorrectWords() {

    buildSpellCheckIndex();

    lrf = h.getRequestFactory("spellchecker", 0, 20);
    lrf.args.put("version", "2.0");
    lrf.args.put("sp.query.accuracy", ".9");

    assertQ("Confirm the index is still valid",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            );

    // Enable multi-word (extended results) formatting:
    lrf.args.put("sp.query.extendedResults", "true");

    assertQ("Failed to spell check",
            req("coat")
            ,"//lst[@name='coat']"
            ,"//lst[@name='coat']/int[@name='frequency'][.=0]"
            ,"//lst[@name='coat']/lst[@name='suggestions' and count(lst)=0]"
            );

    lrf.args.put("sp.query.accuracy", ".2");
    assertQ("Failed to spell check",
            req("coat")
            ,"//lst[@name='coat']"
            ,"//lst[@name='coat']/int[@name='frequency'][.=0]"
            ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cot']"
            ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cat']"
            ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='corn']"
            ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cart']"
            );

    lrf.args.put("sp.query.suggestionCount", "2");
    lrf.args.put("sp.query.accuracy", ".2");
    assertQ("Failed to spell check",
            req("coat")
            ,"//lst[@name='coat']"
            ,"//lst[@name='coat']/int[@name='frequency'][.=0]"
            ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cot']"
            ,"//lst[@name='coat']/lst[@name='suggestions']/lst[@name='cat']"
            );

    lrf.args.put("sp.query.suggestionCount", "2");
    lrf.args.put("sp.query.accuracy", ".2");
    assertQ("Failed to spell check",
            req("cet cert corp")
            ,"//lst[@name='cet']"
            ,"//lst[@name='cet']/int[@name='frequency'][.=0]"
            ,"//lst[@name='cet']/lst[@name='suggestions']/lst[1]"
            ,"//lst[@name='cet']/lst[@name='suggestions']/lst[2]"

            ,"//lst[@name='cert']"
            ,"//lst[@name='cert']/int[@name='frequency'][.=0]"
            ,"//lst[@name='cert']/lst[@name='suggestions']/lst[1]"
            ,"//lst[@name='cert']/lst[@name='suggestions']/lst[2]"

            ,"//lst[@name='corp']"
            ,"//lst[@name='corp']/int[@name='frequency'][.=0]"
            ,"//lst[@name='corp']/lst[@name='suggestions']/lst[1]"
            ,"//lst[@name='corp']/lst[@name='suggestions']/lst[2]"
            );
  }

  public void testSpellCheck_05_buildDictionary() {
    lrf = h.getRequestFactory("spellchecker", 0, 20);
    lrf.args.put("version", "2.0");
    lrf.args.put("sp.query.accuracy", ".9");

    assertU("Add some words to the Spell Check Index:",
            adoc("id", "100", "spell", "solr cat cart"));
    assertU(adoc("id", "101", "spell", "cat cart"));
    assertU(adoc("id", "102", "spell", "cat cart"));
    assertU(adoc("id", "103", "spell", "cat cart carp"));
    assertU(adoc("id", "104", "spell", "cat car cant"));
    assertU(adoc("id", "105", "spell", "cat catnip"));
    assertU(adoc("id", "106", "spell", "cat cattails"));
    assertU(adoc("id", "107", "spell", "cat cod"));
    assertU(adoc("id", "108", "spell", "cat corn"));
    assertU(adoc("id", "109", "spell", "cat cot"));
    assertU(commit());
    assertU(optimize());

    lrf.args.put("sp.dictionary.threshold", "0.20");
    lrf.args.put("cmd", "rebuild");
    assertQ("Need to first build the index:",
            req("cat")
            ,"//str[@name='cmdExecuted'][.='rebuild']"
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            );

    lrf.args.clear();
    lrf.args.put("version", "2.0");
    lrf.args.put("sp.query.accuracy", ".9");

    assertQ("Confirm index contains only words above threshold",
            req("cat")
            ,"//str[@name='words'][.='cat']"
            ,"//str[@name='exist'][.='true']"
            );

    assertQ("Confirm index contains only words above threshold",
            req("cart")
            ,"//str[@name='words'][.='cart']"
            ,"//str[@name='exist'][.='true']"
            );

    assertQ("Confirm index contains only words above threshold",
            req("cod")
            ,"//str[@name='words'][.='cod']"
            ,"//str[@name='exist'][.='false']"
            );

    assertQ("Confirm index contains only words above threshold",
            req("corn")
            ,"//str[@name='words'][.='corn']"
            ,"//str[@name='exist'][.='false']"
            );

    lrf.args.clear();
  }
}
@@ -0,0 +1,83 @@
<?xml version="1.0" ?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->

<!-- This is the Solr schema file. This file should be named "schema.xml" and
     should be in the conf directory under the solr home
     (i.e. ./solr/conf/schema.xml by default)
     or located where the classloader for the Solr webapp can find it.

     For more information on how to customize this file, please see
     http://wiki.apache.org/solr/SchemaXml
-->

<schema name="Solr SpellCheck Test" version="1.1">
  <!-- attribute "name" is the name of this schema and is only used for display purposes.
       Applications should change this to reflect the nature of the search collection.
       version="1.1" is Solr's version number for the schema syntax and semantics.  It should
       not normally be changed by applications.
       1.0: multiValued attribute did not exist, all fields are multiValued by nature
       1.1: multiValued attribute introduced, false by default -->

  <types>
    <fieldtype name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>

    <fieldtype name="text" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.StopFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory"/>
      </analyzer>
    </fieldtype>

    <fieldType name="spellText" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.StandardFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

  </types>


  <fields>
    <field name="id" type="string" indexed="true" stored="true"/>
    <field name="spell" type="spellText" indexed="true" stored="true" />
    <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
  </fields>

  <!-- field to use to determine and enforce document uniqueness. -->
  <uniqueKey>id</uniqueKey>

  <!-- field for the QueryParser to use when an explicit fieldname is absent -->
  <defaultSearchField>text</defaultSearchField>

  <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
  <solrQueryParser defaultOperator="OR"/>

</schema>
@@ -0,0 +1,103 @@
<?xml version="1.0" ?>
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->

<config>
  <indexDefaults>
    <useCompoundFile>false</useCompoundFile>
    <mergeFactor>10</mergeFactor>
    <maxBufferedDocs>1000</maxBufferedDocs>
    <maxMergeDocs>2147483647</maxMergeDocs>
    <maxFieldLength>10000</maxFieldLength>
    <writeLockTimeout>1000</writeLockTimeout>
    <commitLockTimeout>10000</commitLockTimeout>
  </indexDefaults>

  <mainIndex>
    <useCompoundFile>false</useCompoundFile>
    <mergeFactor>10</mergeFactor>
    <maxBufferedDocs>1000</maxBufferedDocs>
    <maxMergeDocs>2147483647</maxMergeDocs>
    <maxFieldLength>10000</maxFieldLength>
    <unlockOnStartup>true</unlockOnStartup>
  </mainIndex>


  <updateHandler class="solr.DirectUpdateHandler2">
    <commitIntervalLowerBound>0</commitIntervalLowerBound>
  </updateHandler>


  <query>
    <maxBooleanClauses>1024</maxBooleanClauses>
    <useFilterForSortedQuery>true</useFilterForSortedQuery>
    <queryResultWindowSize>10</queryResultWindowSize>
    <HashDocSet maxSize="3000" loadFactor="0.75"/>
    <boolTofilterOptimizer enabled="true" cacheSize="32" threshold=".05"/>
  </query>


  <requestHandler name="standard" class="solr.StandardRequestHandler" />
  <requestHandler name="/update" class="solr.XmlUpdateRequestHandler" />


  <!-- SpellCheckerRequestHandler takes in a word (or several words) as the
       value of the "q" parameter and returns a list of alternative spelling
       suggestions.  If invoked with a ...&cmd=rebuild, it will rebuild the
       spellchecker index.
  -->
  <requestHandler name="spellchecker" class="solr.SpellCheckerRequestHandler" startup="lazy">
    <!-- default values for query parameters -->
    <lst name="defaults">
      <int name="sp.query.suggestionCount">20</int>
      <float name="sp.query.accuracy">0.60</float>
    </lst>

    <!-- Main init params for handler -->

    <!-- The directory where your SpellChecker index should live.        -->
    <!-- May be absolute, or relative to the Solr "dataDir" directory.   -->
    <!-- If this option is not specified, a RAM directory will be used.  -->
    <str name="sp.dictionary.spellcheckerIndexDir">spell</str>

    <!-- the field in your schema that you want to be able to build      -->
    <!-- your spell index on. This should be a field that uses a very    -->
    <!-- simple FieldType without a lot of Analysis (ie: string)         -->
    <str name="sp.dictionary.termSourceField">spell</str>

    <!-- threshold for a word to make it into the dictionary:            -->
    <!-- a word must appear in at least the specified percent of documents -->
    <str name="sp.dictionary.threshold">0.0</str>

  </requestHandler>



  <queryResponseWriter name="standard" class="org.apache.solr.request.XMLResponseWriter"/>
  <queryResponseWriter name="useless" class="org.apache.solr.OutputWriterTest$UselessOutputWriter"/>
  <queryResponseWriter name="xslt" class="org.apache.solr.request.XSLTResponseWriter"/>
  <queryResponseWriter name="json" class="org.apache.solr.request.JSONResponseWriter"/>


  <!-- config for the admin interface -->
  <admin>
    <defaultQuery>solr</defaultQuery>
    <gettableFiles>solrconfig.xml schema.xml admin-extra.html</gettableFiles>
  </admin>

</config>
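As a usage note, this test configuration registers the handler under the name "spellchecker", so a rebuild followed by a query would look roughly like the following (the host and port are an assumption for a stock local Solr install, not part of this commit):

    http://localhost:8983/solr/spellchecker?cmd=rebuild&q=cat
    http://localhost:8983/solr/spellchecker?q=coat&sp.query.accuracy=.2&sp.query.suggestionCount=2&sp.query.extendedResults=true

The sp.query.* values override the defaults declared in the <lst name="defaults"> block above, exactly as the test case exercises them.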