From 57cd0765650d0f73db37e090eaed4748fcd2fc82 Mon Sep 17 00:00:00 2001
From: David Spencer <dspencer@apache.org>
Date: Tue, 2 Nov 2004 23:11:29 +0000
Subject: [PATCH] prelim checking of spellchecker, v1.1

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@151014 13f79535-47bb-0310-9956-ffa450edef68
---
 sandbox/contributions/spellchecker/build.xml  | 156 ++++++++
 .../lucene/search/spell/Dictionary.java       |  33 ++
 .../lucene/search/spell/LuceneDictionary.java |  94 +++++
 .../search/spell/PlainTextDictionary.java     |  86 +++++
 .../lucene/search/spell/SpellChecker.java     | 363 ++++++++++++++++++
 .../lucene/search/spell/SuggestWord.java      |  64 +++
 .../lucene/search/spell/SuggestWordQueue.java |  41 ++
 .../lucene/search/spell/TRStringDistance.java | 132 +++++++
 .../lucene/search/spell/TestSpellChecker.java | 122 ++++++
 9 files changed, 1091 insertions(+)
 create mode 100755 sandbox/contributions/spellchecker/build.xml
 create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java
 create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java
 create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java
 create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
 create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java
 create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java
 create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java
 create mode 100755 sandbox/contributions/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
diff --git a/sandbox/contributions/spellchecker/build.xml b/sandbox/contributions/spellchecker/build.xml
new file mode 100755
index 00000000000..d6f8917e16a
--- /dev/null
+++ b/sandbox/contributions/spellchecker/build.xml
@@ -0,0 +1,156 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<project basedir="." default="rebuild" name="Spelling checker">
+
+<property name="lucene.lib" value="d:/dev/lib/lucene.jar"/>
+<property name="lucenetest.lib" value="D:/dev/jakarta-lucene/build/classes/test"/>
+
+
+<property name="name" value="spellchecker"/>
+<property name="Name" value="spellchecker"/>
+<property name="version" value="1.1"/>
+<property name="year" value="2004"/>
+<property name="final.name" value="${name}-${version}"/>
+<property name="java" location="src/java"/>
+<property name="test" location="src/test"/>
+<property name="build.dir" location="build"/>
+<property name="build.java" location="${build.dir}/classes/java"/>
+<property name="build.test" location="${build.dir}/classes/test"/>
+<property name="build.javadocs" location="doc"/>
+<property name="javadoc.link" value="http://java.sun.com/j2se/1.4/docs/api/"/>
+<property name="javac.debug" value="off"/>
+<property name="junit.output.dir" location="${build.dir}/test"/>
+<property name="junit.reports" location="${build.dir}/test/reports"/>
+
+
+
+   <!-- Build classpath -->
+  <path id="classpath">
+    <pathelement location="${lucene.lib}"/>
+
+    <pathelement location="${build.java}"/>
+  </path>
+
+  <path id="test.classpath">
+    <path refid="classpath"/>
+	<pathelement location="${lucenetest.lib}"/>
+    <pathelement location="${build.dir}/classes/test"/>
+  </path>
+  <!--Patternset to exclude files from the output directory:-->
+
+  <!-- ================================================================== -->
+  <!-- C O M P I L E                                                      -->
+  <!-- ================================================================== -->
+  <!--                                                                    -->
+  <!-- ================================================================== -->
+  <target name="javacompile"
+    description="Compiles core classes">
+    <mkdir dir="${build.java}"/>
+    <javac
+      srcdir="${java}"
+      includes="**/*.java"
+      destdir="${build.java}"
+      debug="${javac.debug}"
+	  optimize="on">
+      <classpath refid="classpath"/>
+    </javac>
+  </target>
+
+  <!-- ================================================================== -->
+  <!-- J A R                                                              -->
+  <!-- ================================================================== -->
+  <!--                                                                    -->
+  <!-- ================================================================== -->
+  <target name="jar"  depends="javacompile" description="Generates the Jar file">
+    <jar 
+	destfile="${build.dir}/${final.name}.jar"
+    basedir="${build.java}" />
+  </target>
+
+  <!-- ================================================================== -->
+  <!-- J A V A D O C                                                      -->
+  <!-- ================================================================== -->
+  <!--                                                                    -->
+  <!-- ================================================================== -->
+  <target name="javadoc">
+    <mkdir dir="${build.javadocs}"/>
+    <javadoc
+      sourcepath="${java}"
+      overview="src/java/overview.html"
+      packagenames="org.apache.lucene.*"
+      destdir="${build.javadocs}"
+      author="true"
+      version="true"
+      use="true"
+      link="${javadoc.link}"
+      windowtitle="${Name} ${version} API"
+      doctitle="${Name} ${version} API"
+      bottom="Author: Nicolas Maisonneuve (${year})"  >
+      </javadoc>
+  </target>
+
+  <!-- ================================================================== -->
+  <!-- C L E A N                                                          -->
+  <!-- ================================================================== -->
+  <!--                                                                    -->
+  <!-- ================================================================== -->
+  <target name="clean">
+    <delete failonerror="false" includeemptydirs="true">
+      <fileset dir="${build.dir}"/>
+    </delete>
+  </target>
+
+
+  <!-- ================================================================== -->
+  <!-- B U I L D  T E S T                                                 -->
+  <!-- ================================================================== -->
+  <!--                                                                    -->
+  <!-- ================================================================== -->
+  <target name="compile-test" depends="javacompile">
+    <mkdir dir="${build.test}"/>
+    <javac
+      srcdir="${test}"
+      includes="**/*.java"
+      destdir="${build.test}"
+      debug="true">
+      <classpath refid="test.classpath"/>
+    </javac>
+  </target>
+
+  <!-- ================================================================== -->
+  <!-- R U N  T E S T S                                                   -->
+  <!-- ================================================================== -->
+  <!--                                                                    -->
+  <!-- ================================================================== -->
+  <target name="test" depends="compile-test" description="Runs unit tests">
+    <fail unless="junit.present">
+      ##################################################################
+      JUnit not found.
+      Please make sure junit.jar is in ANT_HOME/lib, or made available
+      to Ant using other mechanisms like -lib or CLASSPATH.
+      ##################################################################
+	  </fail>
+    <mkdir dir="${junit.output.dir}"/>
+    <junit printsummary="off" haltonfailure="no"
+      errorProperty="tests.failed" failureProperty="tests.failed">
+      <classpath refid="junit.classpath"/>
+      <sysproperty key="dataDir" file="src/test"/>
+      <sysproperty key="tempDir" file="${build.dir}/test"/>
+      <formatter type="xml"/>
+      <formatter type="brief" usefile="false"/>
+      <batchtest fork="yes" todir="${junit.output.dir}" unless="testcase">
+        <fileset dir="src/test" includes="**/Test*.java"/>
+      </batchtest>
+      <batchtest fork="yes" todir="${junit.output.dir}" if="testcase">
+        <fileset dir="src/test" includes="**/${testcase}.java"/>
+      </batchtest>
+    </junit>
+
+    <fail if="tests.failed">Tests failed!</fail>
+  </target>
+
+  <target depends="javacompile" name="make"/>
+
+  <target depends="clean,make" name="rebuild"/>
+
+</project>
\ No newline at end of file
diff --git a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java
new file mode 100755
index 00000000000..979621abdeb
--- /dev/null
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java
@@ -0,0 +1,33 @@
+package org.apache.lucene.search.spell;
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Iterator;
+
+/**
+ * A simple interface representing a Dictionary
+ * @author Nicolas Maisonneuve
+ * @version 1.0
+ */
+public interface Dictionary {
+
+    /**
+     * return all the words present in the dictionnary
+     * @return Iterator
+     */
+    public Iterator getWordsIterator();
+
+}
diff --git a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java
new file mode 100755
index 00000000000..d94cedbd5e9
--- /dev/null
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java
@@ -0,0 +1,94 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import java.util.Iterator;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.Term;
+import java.io.*;
+
+/**
+ *  Lucene Dictionnary
+ * @author Nicolas Maisonneuve
+ */
+public class LuceneDictionary
+implements Dictionary {
+    IndexReader reader;
+    String field;
+
+    public LuceneDictionary (IndexReader reader, String field) {
+        this.reader=reader;
+        this.field=field;
+
+    }
+
+
+    public final Iterator getWordsIterator () {
+        return new LuceneIterator();
+    }
+
+
+final  class LuceneIterator    implements Iterator {
+      private  TermEnum enum;
+      private  Term actualTerm;
+      private  boolean has_next_called;
+
+        public LuceneIterator () {
+            try {
+                enum=reader.terms(new Term(field, ""));
+            }
+            catch (IOException ex) {
+                ex.printStackTrace();
+            }
+        }
+
+
+        public Object next () {
+            if (!has_next_called)  {hasNext();}
+             has_next_called=false;
+            return (actualTerm!=null) ? actualTerm.text(): null;
+        }
+
+
+        public boolean hasNext () {
+             has_next_called=true;
+            try {
+                // if there is still words
+                if (!enum.next()) {
+                    actualTerm=null;
+                    return false;
+                }
+                //  if the next word are in the field
+                actualTerm=enum.term();
+                String fieldt=actualTerm.field();
+                if (fieldt!=field) {
+                    actualTerm=null;
+                    return false;
+                }
+                return true;
+            }
+            catch (IOException ex) {
+                ex.printStackTrace();
+                return false;
+            }
+        }
+
+
+        public void remove () {};
+    }
+}
diff --git a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java
new file mode 100755
index 00000000000..230b923744e
--- /dev/null
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java
@@ -0,0 +1,86 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.util.Iterator;
+import java.io.InputStream;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.*;
+
+
+/**
+ * dictionary represented by a file text
+ * Format allowed: 1 word per line:
+ * word1
+ * word2
+ * word3
+ *
+ * @author Nicolas Maisonneuve
+ */
+public class PlainTextDictionary implements Dictionary {
+
+    private BufferedReader in;
+    private String line;
+    private boolean has_next_called;
+
+    public PlainTextDictionary (File file) throws FileNotFoundException {
+        in=new BufferedReader(new FileReader(file));
+    }
+
+
+    public PlainTextDictionary (InputStream dictFile) {
+        in=new BufferedReader(new InputStreamReader(System.in));
+    }
+
+
+    public Iterator getWordsIterator () {
+
+        return new fileIterator();
+    }
+
+
+    final class fileIterator
+    implements Iterator {
+        public Object next () {
+            if (!has_next_called) {
+                hasNext();
+            }
+            has_next_called=false;
+            return line;
+        }
+
+
+        public boolean hasNext () {
+            has_next_called=true;
+            try {
+                line=in.readLine();
+            }
+            catch (IOException ex) {
+                ex.printStackTrace();
+                line=null;
+                return false;
+            }
+            return (line!=null)?true:false;
+        }
+
+
+        public void remove () {};
+    }
+
+}
diff --git a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
new file mode 100755
index 00000000000..93be0a6a4fd
--- /dev/null
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
@@ -0,0 +1,363 @@
+package org.apache.lucene.search.spell;
+
+
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import java.util.*;
+
+
+/**
+ *  <p>
+ *	Spell Checker class  (Main class) <br/>
+ * (initially inspired by the David Spencer code)
+ *  </p>
+ *  
+ *  <p>
+ *  Spell Checker spellchecker= new SpellChecker (spellDirectory);<br/>
+ *  <br/>
+ *  //To index a field of a user index <br/>
+ *  spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));<br/>
+ *<br/>
+ *   //To index a file containing words  <br/>
+ *  spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));<br/>
+ *</p>
+ * 
+ * @author Nicolas Maisonneuve
+ * @version 1.0
+ */
+public class SpellChecker {
+
+    /**
+     * Field name for each word in the ngram index.
+     */
+    public static final String F_WORD="word";
+
+
+    /**
+     * the spell index
+     */
+    Directory spellindex;
+
+    /**
+     * Boost value for start and end grams
+     */private float bStart=2.0f;
+      private float bEnd=1.0f;
+  
+
+    private IndexReader reader;
+    float min=0.5f;
+
+    public void setSpellIndex (Directory spellindex) {
+        this.spellindex=spellindex;
+    }
+
+
+    /**
+     *  Set the accuraty 0<min<1 default 0.5
+     * @param min float
+     */
+    public void setAccuraty (float min) {
+        this.min=min;
+    }
+
+
+    public SpellChecker (Directory gramIndex) {
+        this.setSpellIndex(gramIndex);
+    }
+
+
+    /**
+     * Suggest similar words
+     * @param word String the word you want a spell check done on
+     * @param num_sug int the number of suggest words
+     * @throws IOException
+     * @return String[]
+     */
+    public String[] suggestSimilar (String word, int num_sug) throws IOException {
+        return this.suggestSimilar(word, num_sug, null, null, false);
+    }
+
+
+    /**
+     * Suggest similar words (restricted or not of a field of a user index)
+     * @param word String the word you want a spell check done on
+     * @param num_sug int the number of suggest words
+     * @param IndexReader the indexReader of the user index (can be null see field param)
+     * @param field String the field of the user index: if field is not null ,the suggest
+     * words are restricted to the words present in this field.
+     * @param morePopular boolean return only the suggest words that are more frequent than the searched word
+     * (only if restricted mode = (indexReader!=null and field!=null)
+     * @throws IOException
+     * @return String[] the sorted list of the suggest words with this 2 criteri
+     * first criteria : the edit distance, second criteria (only if restricted mode): the popularity
+     * of the suggest words in the field of the user index
+     */
+    public String[] suggestSimilar (String word, int num_sug, IndexReader ir, String field
+    , boolean morePopular) throws IOException {
+
+        final TRStringDistance sd=new TRStringDistance(word);
+        final int lengthWord=word.length();
+
+        final int goalFreq=(morePopular&&ir!=null)?ir.docFreq(new Term(field, word)):0;
+        if (!morePopular&&goalFreq>0) {
+            return new String[] {
+            word}; // return the word if it exist in the index and i don't want a more popular word
+        }
+
+        BooleanQuery query=new BooleanQuery();
+        String[] grams;
+        String key;
+
+        for (int ng=getMin(lengthWord); ng<=getMax(lengthWord); ng++) {
+
+            key="gram"+ng; // form key
+
+            grams=formGrams(word, ng); // form word into ngrams (allow dups too)
+
+            if (grams.length==0) {
+                continue; // hmm
+            }
+
+            if (bStart>0) { // should we boost prefixes?
+                add(query, "start"+ng, grams[0], bStart); // matches start of word
+
+            }
+            if (bEnd>0) { // should we boost suffixes
+                add(query, "end"+ng, grams[grams.length-1], bEnd); // matches end of word
+
+            }
+            for (int i=0; i<grams.length; i++) {
+                add(query, key, grams[i]);
+            }
+
+        }
+
+        IndexSearcher searcher=new IndexSearcher(this.spellindex);
+        Hits hits=searcher.search(query);
+        SuggestWordQueue sugqueue=new SuggestWordQueue(num_sug);
+
+        int stop=Math.min(hits.length(), 10*num_sug); // go thru more than 'maxr' matches in case the distance filter triggers
+        SuggestWord sugword=new SuggestWord();
+        for (int i=0; i<stop; i++) {
+
+            sugword.string=hits.doc(i).get(F_WORD); // get orig word)
+
+            if (sugword.string==word) {
+                continue; // don't suggest a word for itself, that would be silly
+            }
+
+            //edit distance/normalize with the min word length
+            sugword.score=1.0f-((float) sd.getDistance(sugword.string)/Math.min(sugword.string.length(), lengthWord));
+            if (sugword.score<min) {
+                continue;
+            }
+
+            if (ir!=null) { // use the user index
+                sugword.freq=ir.docFreq(new Term(field, sugword.string)); // freq in the index
+                if ((morePopular&&goalFreq>sugword.freq)||sugword.freq<1) { // don't suggest a word that is not present in the field
+                    continue;
+                }
+            }
+            sugqueue.insert(sugword);
+            if (sugqueue.size()==num_sug) {
+                //if queue full , maintain the min score
+                min=((SuggestWord) sugqueue.top()).score;
+            }
+            sugword=new SuggestWord();
+        }
+
+        // convert to array string
+        String[] list=new String[sugqueue.size()];
+        for (int i=sugqueue.size()-1; i>=0; i--) {
+            list[i]=((SuggestWord) sugqueue.pop()).string;
+        }
+
+        searcher.close();
+        return list;
+    }
+
+
+    /**
+     * Add a clause to a boolean query.
+     */
+    private static void add (BooleanQuery q, String k, String v, float boost) {
+        Query tq=new TermQuery(new Term(k, v));
+        tq.setBoost(boost);
+        q.add(new BooleanClause(tq, false, false));
+    }
+
+
+    /**
+     * Add a clause to a boolean query.
+     */
+    private static void add (BooleanQuery q, String k, String v) {
+        q.add(new BooleanClause(new TermQuery(new Term(k, v)), false, false));
+    }
+
+
+    /**
+     * Form all ngrams for a given word.
+     * @param text the word to parse
+     * @param ng the ngram length e.g. 3
+     * @return an array of all ngrams in the word and note that duplicates are not removed
+     */
+    private static String[] formGrams (String text, int ng) {
+        int len=text.length();
+        String[] res=new String[len-ng+1];
+        for (int i=0; i<len-ng+1; i++) {
+            res[i]=text.substring(i, i+ng);
+        }
+        return res;
+    }
+
+
+    public void clearIndex () throws IOException {
+        IndexReader.unlock(spellindex);
+        IndexWriter writer=new IndexWriter(spellindex, null, true);
+        writer.close();
+    }
+
+
+    /**
+     * if the word exist in the index
+     * @param word String
+     * @throws IOException
+     * @return boolean
+     */
+    public boolean exist (String word) throws IOException {
+        if (reader==null) {
+            reader=IndexReader.open(spellindex);
+        }
+        return reader.docFreq(new Term(F_WORD, word))>0;
+    }
+
+
+    /**
+     * Index a Dictionnary
+     * @param dict the dictionnary to index
+     * @throws IOException
+     */
+    public void indexDictionnary (Dictionary dict) throws IOException {
+
+        int ng1, ng2;
+        IndexReader.unlock(spellindex);
+        IndexWriter writer=new IndexWriter(spellindex, new WhitespaceAnalyzer(), !IndexReader.indexExists(spellindex));
+        writer.mergeFactor=300;
+        writer.minMergeDocs=150;
+
+        Iterator iter=dict.getWordsIterator();
+        while (iter.hasNext()) {
+            String word=(String) iter.next();
+
+            int len=word.length();
+            if (len<3) {
+                continue; // too short we bail but "too long" is fine...
+            }
+
+            if (this.exist(word)) { // if the word already exist in the gramindex
+                continue;
+            }
+
+            // ok index the word
+            Document doc=createDocument(word, getMin(len), getMax(len));
+            writer.addDocument(doc);
+        }
+        // close writer
+        writer.optimize();
+        writer.close();
+
+        // close reader
+        reader.close();
+        reader=null;
+    }
+
+
+    private int getMin (int l) {
+        if (l>5) {
+            return 3;
+        }
+        if (l==5) {
+            return 2;
+        }
+        return 1;
+    }
+
+
+    private int getMax (int l) {
+        if (l>5) {
+            return 4;
+        }
+        if (l==5) {
+            return 3;
+        }
+        return 2;
+
+    }
+
+
+    private static Document createDocument (String text, int ng1, int ng2) {
+        Document doc=new Document();
+        doc.add(Field.Keyword(F_WORD, text)); // orig term
+        addGram(text, doc, ng1, ng2);
+        return doc;
+    }
+
+
+    private static void addGram (String text, Document doc, int ng1, int ng2) {
+        int len=text.length();
+        for (int ng=ng1; ng<=ng2; ng++) {
+            String key="gram"+ng;
+            String end=null;
+            for (int i=0; i<len-ng+1; i++) {
+                String gram=text.substring(i, i+ng);
+                doc.add(Field.Keyword(key, gram));
+                if (i==0) {
+                    doc.add(Field.Keyword("start"+ng, gram));
+                }
+                end=gram;
+            }
+            if (end!=null) { // may not be present if len==ng1
+                doc.add(Field.Keyword("end"+ng, end));
+            }
+        }
+    }
+
+
+    protected void finalize () throws Throwable {
+        if (reader!=null) {
+            reader.close();
+        }
+    }
+
+}
diff --git a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java
new file mode 100755
index 00000000000..722d1a3bc60
--- /dev/null
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java
@@ -0,0 +1,64 @@
+package org.apache.lucene.search.spell;
+
+
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *  SuggestWord Class
+ *  used in suggestSimilat method in SpellChecker class
+ *  @author Nicolas Maisonneuve
+ */
+ final class SuggestWord {
+    /**
+     * the score of the word
+     */
+    public float score;
+
+
+    /**
+     * The freq of the word
+     */
+    public int freq;
+
+
+    /**
+     * the suggested word
+     */
+    public String string;
+
+
+    public final int compareTo (SuggestWord a) {
+        //first criteria: the edit distance
+        if (score>a.score) {
+            return 1;
+        }
+        if (score<a.score) {
+            return-1;
+        }
+
+        //second criteria (if first criteria is equal): the popularity
+        if (freq>a.freq) {
+            return 1;
+        }
+
+        if (freq<a.freq) {
+            return-1;
+        }
+
+        return 0;
+    }
+}
diff --git a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java
new file mode 100755
index 00000000000..a96c29dbbc4
--- /dev/null
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java
@@ -0,0 +1,41 @@
+package org.apache.lucene.search.spell;
+
+
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ *  to sort SuggestWord
+ * @author Nicolas Maisonneuve
+ */
+import org.apache.lucene.util.PriorityQueue;
+
+
+final class SuggestWordQueue
+extends PriorityQueue {
+
+    SuggestWordQueue (int size) {
+        initialize(size);
+    }
+
+    protected final boolean lessThan (Object a, Object b) {
+        SuggestWord wa=(SuggestWord) a;
+        SuggestWord wb=(SuggestWord) b;
+        int val=wa.compareTo(wb);
+        return val<0;
+    }
+
+}
diff --git a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java
new file mode 100755
index 00000000000..992d3bba856
--- /dev/null
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java
@@ -0,0 +1,132 @@
+package org.apache.lucene.search.spell;
+
+
+/**
+ * Copyright 2002-2004 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Edit distance  class
+ */
+public final class TRStringDistance {
+
+    final char[] sa;
+    final int n;
+    final int[][][] cache=new int[30][][];
+
+
+    /**
+     * Optimized to run a bit faster than the static getDistance().
+     * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
+     */
+    public TRStringDistance (String target) {
+        sa=target.toCharArray();
+        n=sa.length;
+    }
+
+
+    //*****************************
+     // Compute Levenshtein distance
+     //*****************************
+      public final int getDistance (String other) {
+          int d[][]; // matrix
+          int cost; // cost
+
+          // Step 1
+          final char[] ta=other.toCharArray();
+          final int m=ta.length;
+          if (n==0) {
+              return m;
+          }
+          if (m==0) {
+              return n;
+          }
+
+          if (m>=cache.length) {
+              d=form(n, m);
+          }
+          else if (cache[m]!=null) {
+              d=cache[m];
+          }
+          else {
+              d=cache[m]=form(n, m);
+
+              // Step 3
+
+          }
+          for (int i=1; i<=n; i++) {
+              final char s_i=sa[i-1];
+
+              // Step 4
+
+              for (int j=1; j<=m; j++) {
+                  final char t_j=ta[j-1];
+
+                  // Step 5
+
+                  if (s_i==t_j) { // same
+                      cost=0;
+                  }
+                  else { // not a match
+                      cost=1;
+
+                      // Step 6
+
+                  }
+                  d[i][j]=min3(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+cost);
+
+              }
+
+          }
+
+          // Step 7
+          return d[n][m];
+
+      }
+
+
+    /**
+     *
+     */
+    private static int[][] form (int n, int m) {
+        int[][] d=new int[n+1][m+1];
+        // Step 2
+
+        for (int i=0; i<=n; i++) {
+            d[i][0]=i;
+
+        }
+        for (int j=0; j<=m; j++) {
+            d[0][j]=j;
+        }
+        return d;
+    }
+
+
+    //****************************
+     // Get minimum of three values
+     //****************************
+      private static int min3 (int a, int b, int c) {
+          int mi=a;
+          if (b<mi) {
+              mi=b;
+          }
+          if (c<mi) {
+              mi=c;
+          }
+          return mi;
+
+      }
+}
diff --git a/sandbox/contributions/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java b/sandbox/contributions/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
new file mode 100755
index 00000000000..f6a2d9c44af
--- /dev/null
+++ b/sandbox/contributions/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
@@ -0,0 +1,122 @@
+package org.apache.lucene.search.spell;
+
+
+import junit.framework.*;
+import org.apache.lucene.search.spell.*;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.util.English;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import java.io.IOException;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.Directory;
+import java.io.File;
+
+
+/**
+ * Test case
+ * @author Nicolas Maisonneuve
+ */
+
+public class TestSpellChecker
+extends TestCase {
+    private SpellChecker spellChecker;
+    Directory userindex, spellindex;
+
+    protected void setUp () throws Exception {
+        super.setUp();
+
+        //create a user index
+        userindex=new RAMDirectory();
+        IndexWriter writer=new IndexWriter(userindex, new SimpleAnalyzer(), true);
+
+        for (int i=0; i<1000; i++) {
+            Document doc=new Document();
+            doc.add(Field.Text("field1", English.intToEnglish(i)));
+            doc.add(Field.Text("field2", English.intToEnglish(i+1))); // + word thousand
+            writer.addDocument(doc);
+        }
+        writer.close();
+
+        // create the spellChecker
+        File file=new File("d://test");
+        spellindex=FSDirectory.getDirectory(file, true);
+        spellChecker=new SpellChecker(spellindex);
+    }
+
+
+    public void testBuild () {
+        try {
+            IndexReader r=IndexReader.open(userindex);
+
+            spellChecker.clearIndex();
+
+            addwords(r, "field1");
+            int num_field1=this.numdoc();
+
+            addwords(r, "field2");
+            int num_field2=this.numdoc();
+
+            this.assertTrue(num_field2==num_field1+1);
+
+            // test small word
+            String[] l=spellChecker.suggestSimilar("fvie", 2);
+            this.assertTrue(l[0].equals("five"));
+
+            l=spellChecker.suggestSimilar("fiv", 2);
+            this.assertTrue(l[0].equals("five"));
+
+            l=spellChecker.suggestSimilar("ive", 2);
+            this.assertTrue(l[0].equals("five"));
+
+            l=spellChecker.suggestSimilar("fives", 2);
+            this.assertTrue(l[0].equals("five"));
+
+            l=spellChecker.suggestSimilar("fie", 2);
+            this.assertTrue(l[0].equals("five"));
+
+            l=spellChecker.suggestSimilar("fi", 2);
+            this.assertEquals(0,l.length);
+
+            // test restreint to a field
+            l=spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
+            this.assertEquals(0,l.length); // there isn't the term thousand in the field field1
+
+            l=spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
+            this.assertEquals(1,l.length); // there is the term thousand in the field field2
+        }
+        catch (IOException e) {
+            e.printStackTrace();
+            this.assertTrue(false);
+        }
+
+    }
+
+
+    private void addwords (IndexReader r, String field) throws IOException {
+        long time=System.currentTimeMillis();
+        spellChecker.indexDictionnary(new LuceneDictionary(r, field));
+        time=System.currentTimeMillis()-time;
+        System.out.println("time to build "+field+": "+time);
+    }
+
+
+    private int numdoc () throws IOException {
+        IndexReader rs=IndexReader.open(spellindex);
+        int num=rs.numDocs();
+        this.assertTrue(num!=0);
+        System.out.println("num docs: "+num);
+        rs.close();
+        return num;
+    }
+
+
+    protected void tearDown () throws Exception {
+        spellChecker=null;
+        super.tearDown();
+    }
+
+}