prelim checking of spellchecker, v1.1

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@151014 13f79535-47bb-0310-9956-ffa450edef68
2004-11-02 23:11:29 +00:00 · 2004-11-02 23:11:29 +00:00 · 57cd076565
parent a8d98638a0
commit 57cd076565
9 changed files with 1091 additions and 0 deletions
--- a/sandbox/contributions/spellchecker/build.xml
+++ b/sandbox/contributions/spellchecker/build.xml
@ -0,0 +1,156 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project basedir="." default="rebuild" name="Spelling checker">
 <property name="lucene.lib" value="d:/dev/lib/lucene.jar"/>
 <property name="lucenetest.lib" value="D:/dev/jakarta-lucene/build/classes/test"/>
 <property name="name" value="spellchecker"/>
 <property name="Name" value="spellchecker"/>
 <property name="version" value="1.1"/>
 <property name="year" value="2004"/>
 <property name="final.name" value="${name}-${version}"/>
 <property name="java" location="src/java"/>
 <property name="test" location="src/test"/>
 <property name="build.dir" location="build"/>
 <property name="build.java" location="${build.dir}/classes/java"/>
 <property name="build.test" location="${build.dir}/classes/test"/>
 <property name="build.javadocs" location="doc"/>
 <property name="javadoc.link" value="http://java.sun.com/j2se/1.4/docs/api/"/>
 <property name="javac.debug" value="off"/>
 <property name="junit.output.dir" location="${build.dir}/test"/>
 <property name="junit.reports" location="${build.dir}/test/reports"/>
   <!-- Build classpath -->
  <path id="classpath">
    <pathelement location="${lucene.lib}"/>
    <pathelement location="${build.java}"/>
  </path>
  <path id="test.classpath">
    <path refid="classpath"/>
 	<pathelement location="${lucenetest.lib}"/>
    <pathelement location="${build.dir}/classes/test"/>
  </path>
  <!--Patternset to exclude files from the output directory:-->
  <!-- ================================================================== -->
  <!-- C O M P I L E                                                      -->
  <!-- ================================================================== -->
  <!--                                                                    -->
  <!-- ================================================================== -->
  <target name="javacompile"
    description="Compiles core classes">
    <mkdir dir="${build.java}"/>
    <javac
      srcdir="${java}"
      includes="**/*.java"
      destdir="${build.java}"
      debug="${javac.debug}"
 	  optimize="on">
      <classpath refid="classpath"/>
    </javac>
  </target>
  <!-- ================================================================== -->
  <!-- J A R                                                              -->
  <!-- ================================================================== -->
  <!--                                                                    -->
  <!-- ================================================================== -->
  <target name="jar"  depends="javacompile" description="Generates the Jar file">
    <jar 
 	destfile="${build.dir}/${final.name}.jar"
    basedir="${build.java}" />
  </target>
  <!-- ================================================================== -->
  <!-- J A V A D O C                                                      -->
  <!-- ================================================================== -->
  <!--                                                                    -->
  <!-- ================================================================== -->
  <target name="javadoc">
    <mkdir dir="${build.javadocs}"/>
    <javadoc
      sourcepath="${java}"
      overview="src/java/overview.html"
      packagenames="org.apache.lucene.*"
      destdir="${build.javadocs}"
      author="true"
      version="true"
      use="true"
      link="${javadoc.link}"
      windowtitle="${Name} ${version} API"
      doctitle="${Name} ${version} API"
      bottom="Author: Nicolas Maisonneuve (${year})"  >
      </javadoc>
  </target>
  <!-- ================================================================== -->
  <!-- C L E A N                                                          -->
  <!-- ================================================================== -->
  <!--                                                                    -->
  <!-- ================================================================== -->
  <target name="clean">
    <delete failonerror="false" includeemptydirs="true">
      <fileset dir="${build.dir}"/>
    </delete>
  </target>
  <!-- ================================================================== -->
  <!-- B U I L D  T E S T                                                 -->
  <!-- ================================================================== -->
  <!--                                                                    -->
  <!-- ================================================================== -->
  <target name="compile-test" depends="javacompile">
    <mkdir dir="${build.test}"/>
    <javac
      srcdir="${test}"
      includes="**/*.java"
      destdir="${build.test}"
      debug="true">
      <classpath refid="test.classpath"/>
    </javac>
  </target>
  <!-- ================================================================== -->
  <!-- R U N  T E S T S                                                   -->
  <!-- ================================================================== -->
  <!--                                                                    -->
  <!-- ================================================================== -->
  <target name="test" depends="compile-test" description="Runs unit tests">
    <fail unless="junit.present">
      ##################################################################
      JUnit not found.
      Please make sure junit.jar is in ANT_HOME/lib, or made available
      to Ant using other mechanisms like -lib or CLASSPATH.
      ##################################################################
 	  </fail>
    <mkdir dir="${junit.output.dir}"/>
    <junit printsummary="off" haltonfailure="no"
      errorProperty="tests.failed" failureProperty="tests.failed">
      <classpath refid="junit.classpath"/>
      <sysproperty key="dataDir" file="src/test"/>
      <sysproperty key="tempDir" file="${build.dir}/test"/>
      <formatter type="xml"/>
      <formatter type="brief" usefile="false"/>
      <batchtest fork="yes" todir="${junit.output.dir}" unless="testcase">
        <fileset dir="src/test" includes="**/Test*.java"/>
      </batchtest>
      <batchtest fork="yes" todir="${junit.output.dir}" if="testcase">
        <fileset dir="src/test" includes="**/${testcase}.java"/>
      </batchtest>
    </junit>
    <fail if="tests.failed">Tests failed!</fail>
  </target>
  <target depends="javacompile" name="make"/>
  <target depends="clean,make" name="rebuild"/>
 </project>
--- a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java
@ -0,0 +1,33 @@
 package org.apache.lucene.search.spell;
 /**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.util.Iterator;
 /**
 * A simple interface representing a Dictionary
 * @author Nicolas Maisonneuve
 * @version 1.0
 */
 public interface Dictionary {
    /**
     * return all the words present in the dictionnary
     * @return Iterator
     */
    public Iterator getWordsIterator();
 }
--- a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java
@ -0,0 +1,94 @@
 package org.apache.lucene.search.spell;
 /**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import org.apache.lucene.index.IndexReader;
 import java.util.Iterator;
 import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.index.Term;
 import java.io.*;
 /**
 *  Lucene Dictionnary
 * @author Nicolas Maisonneuve
 */
 public class LuceneDictionary
 implements Dictionary {
    IndexReader reader;
    String field;
    public LuceneDictionary (IndexReader reader, String field) {
        this.reader=reader;
        this.field=field;
    }
    public final Iterator getWordsIterator () {
        return new LuceneIterator();
    }
 final  class LuceneIterator    implements Iterator {
      private  TermEnum enum;
      private  Term actualTerm;
      private  boolean has_next_called;
        public LuceneIterator () {
            try {
                enum=reader.terms(new Term(field, ""));
            }
            catch (IOException ex) {
                ex.printStackTrace();
            }
        }
        public Object next () {
            if (!has_next_called)  {hasNext();}
             has_next_called=false;
            return (actualTerm!=null) ? actualTerm.text(): null;
        }
        public boolean hasNext () {
             has_next_called=true;
            try {
                // if there is still words
                if (!enum.next()) {
                    actualTerm=null;
                    return false;
                }
                //  if the next word are in the field
                actualTerm=enum.term();
                String fieldt=actualTerm.field();
                if (fieldt!=field) {
                    actualTerm=null;
                    return false;
                }
                return true;
            }
            catch (IOException ex) {
                ex.printStackTrace();
                return false;
            }
        }
        public void remove () {};
    }
 }
--- a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java
@ -0,0 +1,86 @@
 package org.apache.lucene.search.spell;
 /**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.util.Iterator;
 import java.io.InputStream;
 import java.io.BufferedReader;
 import java.io.InputStreamReader;
 import java.io.*;
 /**
 * dictionary represented by a file text
 * Format allowed: 1 word per line:
 * word1
 * word2
 * word3
 *
 * @author Nicolas Maisonneuve
 */
 public class PlainTextDictionary implements Dictionary {
    private BufferedReader in;
    private String line;
    private boolean has_next_called;
    public PlainTextDictionary (File file) throws FileNotFoundException {
        in=new BufferedReader(new FileReader(file));
    }
    public PlainTextDictionary (InputStream dictFile) {
        in=new BufferedReader(new InputStreamReader(System.in));
    }
    public Iterator getWordsIterator () {
        return new fileIterator();
    }
    final class fileIterator
    implements Iterator {
        public Object next () {
            if (!has_next_called) {
                hasNext();
            }
            has_next_called=false;
            return line;
        }
        public boolean hasNext () {
            has_next_called=true;
            try {
                line=in.readLine();
            }
            catch (IOException ex) {
                ex.printStackTrace();
                line=null;
                return false;
            }
            return (line!=null)?true:false;
        }
        public void remove () {};
    }
 }
--- a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
@ -0,0 +1,363 @@
 package org.apache.lucene.search.spell;
 /**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermEnum;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.Hits;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
 import java.util.*;
 /**
 *  <p>
 *	Spell Checker class  (Main class) <br/>
 * (initially inspired by the David Spencer code)
 *  </p>
 *  
 *  <p>
 *  Spell Checker spellchecker= new SpellChecker (spellDirectory);<br/>
 *  <br/>
 *  //To index a field of a user index <br/>
 *  spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));<br/>
 *<br/>
 *   //To index a file containing words  <br/>
 *  spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));<br/>
 *</p>
 * 
 * @author Nicolas Maisonneuve
 * @version 1.0
 */
 public class SpellChecker {
    /**
     * Field name for each word in the ngram index.
     */
    public static final String F_WORD="word";
    /**
     * the spell index
     */
    Directory spellindex;
    /**
     * Boost value for start and end grams
     */private float bStart=2.0f;
      private float bEnd=1.0f;
    private IndexReader reader;
    float min=0.5f;
    public void setSpellIndex (Directory spellindex) {
        this.spellindex=spellindex;
    }
    /**
     *  Set the accuraty 0<min<1 default 0.5
     * @param min float
     */
    public void setAccuraty (float min) {
        this.min=min;
    }
    public SpellChecker (Directory gramIndex) {
        this.setSpellIndex(gramIndex);
    }
    /**
     * Suggest similar words
     * @param word String the word you want a spell check done on
     * @param num_sug int the number of suggest words
     * @throws IOException
     * @return String[]
     */
    public String[] suggestSimilar (String word, int num_sug) throws IOException {
        return this.suggestSimilar(word, num_sug, null, null, false);
    }
    /**
     * Suggest similar words (restricted or not of a field of a user index)
     * @param word String the word you want a spell check done on
     * @param num_sug int the number of suggest words
     * @param IndexReader the indexReader of the user index (can be null see field param)
     * @param field String the field of the user index: if field is not null ,the suggest
     * words are restricted to the words present in this field.
     * @param morePopular boolean return only the suggest words that are more frequent than the searched word
     * (only if restricted mode = (indexReader!=null and field!=null)
     * @throws IOException
     * @return String[] the sorted list of the suggest words with this 2 criteri
     * first criteria : the edit distance, second criteria (only if restricted mode): the popularity
     * of the suggest words in the field of the user index
     */
    public String[] suggestSimilar (String word, int num_sug, IndexReader ir, String field
    , boolean morePopular) throws IOException {
        final TRStringDistance sd=new TRStringDistance(word);
        final int lengthWord=word.length();
        final int goalFreq=(morePopular&&ir!=null)?ir.docFreq(new Term(field, word)):0;
        if (!morePopular&&goalFreq>0) {
            return new String[] {
            word}; // return the word if it exist in the index and i don't want a more popular word
        }
        BooleanQuery query=new BooleanQuery();
        String[] grams;
        String key;
        for (int ng=getMin(lengthWord); ng<=getMax(lengthWord); ng++) {
            key="gram"+ng; // form key
            grams=formGrams(word, ng); // form word into ngrams (allow dups too)
            if (grams.length==0) {
                continue; // hmm
            }
            if (bStart>0) { // should we boost prefixes?
                add(query, "start"+ng, grams[0], bStart); // matches start of word
            }
            if (bEnd>0) { // should we boost suffixes
                add(query, "end"+ng, grams[grams.length-1], bEnd); // matches end of word
            }
            for (int i=0; i<grams.length; i++) {
                add(query, key, grams[i]);
            }
        }
        IndexSearcher searcher=new IndexSearcher(this.spellindex);
        Hits hits=searcher.search(query);
        SuggestWordQueue sugqueue=new SuggestWordQueue(num_sug);
        int stop=Math.min(hits.length(), 10*num_sug); // go thru more than 'maxr' matches in case the distance filter triggers
        SuggestWord sugword=new SuggestWord();
        for (int i=0; i<stop; i++) {
            sugword.string=hits.doc(i).get(F_WORD); // get orig word)
            if (sugword.string==word) {
                continue; // don't suggest a word for itself, that would be silly
            }
            //edit distance/normalize with the min word length
            sugword.score=1.0f-((float) sd.getDistance(sugword.string)/Math.min(sugword.string.length(), lengthWord));
            if (sugword.score<min) {
                continue;
            }
            if (ir!=null) { // use the user index
                sugword.freq=ir.docFreq(new Term(field, sugword.string)); // freq in the index
                if ((morePopular&&goalFreq>sugword.freq)||sugword.freq<1) { // don't suggest a word that is not present in the field
                    continue;
                }
            }
            sugqueue.insert(sugword);
            if (sugqueue.size()==num_sug) {
                //if queue full , maintain the min score
                min=((SuggestWord) sugqueue.top()).score;
            }
            sugword=new SuggestWord();
        }
        // convert to array string
        String[] list=new String[sugqueue.size()];
        for (int i=sugqueue.size()-1; i>=0; i--) {
            list[i]=((SuggestWord) sugqueue.pop()).string;
        }
        searcher.close();
        return list;
    }
    /**
     * Add a clause to a boolean query.
     */
    private static void add (BooleanQuery q, String k, String v, float boost) {
        Query tq=new TermQuery(new Term(k, v));
        tq.setBoost(boost);
        q.add(new BooleanClause(tq, false, false));
    }
    /**
     * Add a clause to a boolean query.
     */
    private static void add (BooleanQuery q, String k, String v) {
        q.add(new BooleanClause(new TermQuery(new Term(k, v)), false, false));
    }
    /**
     * Form all ngrams for a given word.
     * @param text the word to parse
     * @param ng the ngram length e.g. 3
     * @return an array of all ngrams in the word and note that duplicates are not removed
     */
    private static String[] formGrams (String text, int ng) {
        int len=text.length();
        String[] res=new String[len-ng+1];
        for (int i=0; i<len-ng+1; i++) {
            res[i]=text.substring(i, i+ng);
        }
        return res;
    }
    public void clearIndex () throws IOException {
        IndexReader.unlock(spellindex);
        IndexWriter writer=new IndexWriter(spellindex, null, true);
        writer.close();
    }
    /**
     * if the word exist in the index
     * @param word String
     * @throws IOException
     * @return boolean
     */
    public boolean exist (String word) throws IOException {
        if (reader==null) {
            reader=IndexReader.open(spellindex);
        }
        return reader.docFreq(new Term(F_WORD, word))>0;
    }
    /**
     * Index a Dictionnary
     * @param dict the dictionnary to index
     * @throws IOException
     */
    public void indexDictionnary (Dictionary dict) throws IOException {
        int ng1, ng2;
        IndexReader.unlock(spellindex);
        IndexWriter writer=new IndexWriter(spellindex, new WhitespaceAnalyzer(), !IndexReader.indexExists(spellindex));
        writer.mergeFactor=300;
        writer.minMergeDocs=150;
        Iterator iter=dict.getWordsIterator();
        while (iter.hasNext()) {
            String word=(String) iter.next();
            int len=word.length();
            if (len<3) {
                continue; // too short we bail but "too long" is fine...
            }
            if (this.exist(word)) { // if the word already exist in the gramindex
                continue;
            }
            // ok index the word
            Document doc=createDocument(word, getMin(len), getMax(len));
            writer.addDocument(doc);
        }
        // close writer
        writer.optimize();
        writer.close();
        // close reader
        reader.close();
        reader=null;
    }
    private int getMin (int l) {
        if (l>5) {
            return 3;
        }
        if (l==5) {
            return 2;
        }
        return 1;
    }
    private int getMax (int l) {
        if (l>5) {
            return 4;
        }
        if (l==5) {
            return 3;
        }
        return 2;
    }
    private static Document createDocument (String text, int ng1, int ng2) {
        Document doc=new Document();
        doc.add(Field.Keyword(F_WORD, text)); // orig term
        addGram(text, doc, ng1, ng2);
        return doc;
    }
    private static void addGram (String text, Document doc, int ng1, int ng2) {
        int len=text.length();
        for (int ng=ng1; ng<=ng2; ng++) {
            String key="gram"+ng;
            String end=null;
            for (int i=0; i<len-ng+1; i++) {
                String gram=text.substring(i, i+ng);
                doc.add(Field.Keyword(key, gram));
                if (i==0) {
                    doc.add(Field.Keyword("start"+ng, gram));
                }
                end=gram;
            }
            if (end!=null) { // may not be present if len==ng1
                doc.add(Field.Keyword("end"+ng, end));
            }
        }
    }
    protected void finalize () throws Throwable {
        if (reader!=null) {
            reader.close();
        }
    }
 }
--- a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java
@ -0,0 +1,64 @@
 package org.apache.lucene.search.spell;
 /**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /**
 *  SuggestWord Class
 *  used in suggestSimilat method in SpellChecker class
 *  @author Nicolas Maisonneuve
 */
 final class SuggestWord {
    /**
     * the score of the word
     */
    public float score;
    /**
     * The freq of the word
     */
    public int freq;
    /**
     * the suggested word
     */
    public String string;
    public final int compareTo (SuggestWord a) {
        //first criteria: the edit distance
        if (score>a.score) {
            return 1;
        }
        if (score<a.score) {
            return-1;
        }
        //second criteria (if first criteria is equal): the popularity
        if (freq>a.freq) {
            return 1;
        }
        if (freq<a.freq) {
            return-1;
        }
        return 0;
    }
 }
--- a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java
@ -0,0 +1,41 @@
 package org.apache.lucene.search.spell;
 /**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /**
 *  to sort SuggestWord
 * @author Nicolas Maisonneuve
 */
 import org.apache.lucene.util.PriorityQueue;
 final class SuggestWordQueue
 extends PriorityQueue {
    SuggestWordQueue (int size) {
        initialize(size);
    }
    protected final boolean lessThan (Object a, Object b) {
        SuggestWord wa=(SuggestWord) a;
        SuggestWord wb=(SuggestWord) b;
        int val=wa.compareTo(wb);
        return val<0;
    }
 }
--- a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java
+++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java
@ -0,0 +1,132 @@
 package org.apache.lucene.search.spell;
 /**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 /**
 * Edit distance  class
 */
 public final class TRStringDistance {
    final char[] sa;
    final int n;
    final int[][][] cache=new int[30][][];
    /**
     * Optimized to run a bit faster than the static getDistance().
     * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
     */
    public TRStringDistance (String target) {
        sa=target.toCharArray();
        n=sa.length;
    }
    //*****************************
     // Compute Levenshtein distance
     //*****************************
      public final int getDistance (String other) {
          int d[][]; // matrix
          int cost; // cost
          // Step 1
          final char[] ta=other.toCharArray();
          final int m=ta.length;
          if (n==0) {
              return m;
          }
          if (m==0) {
              return n;
          }
          if (m>=cache.length) {
              d=form(n, m);
          }
          else if (cache[m]!=null) {
              d=cache[m];
          }
          else {
              d=cache[m]=form(n, m);
              // Step 3
          }
          for (int i=1; i<=n; i++) {
              final char s_i=sa[i-1];
              // Step 4
              for (int j=1; j<=m; j++) {
                  final char t_j=ta[j-1];
                  // Step 5
                  if (s_i==t_j) { // same
                      cost=0;
                  }
                  else { // not a match
                      cost=1;
                      // Step 6
                  }
                  d[i][j]=min3(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+cost);
              }
          }
          // Step 7
          return d[n][m];
      }
    /**
     *
     */
    private static int[][] form (int n, int m) {
        int[][] d=new int[n+1][m+1];
        // Step 2
        for (int i=0; i<=n; i++) {
            d[i][0]=i;
        }
        for (int j=0; j<=m; j++) {
            d[0][j]=j;
        }
        return d;
    }
    //****************************
     // Get minimum of three values
     //****************************
      private static int min3 (int a, int b, int c) {
          int mi=a;
          if (b<mi) {
              mi=b;
          }
          if (c<mi) {
              mi=c;
          }
          return mi;
      }
 }
--- a/sandbox/contributions/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
+++ b/sandbox/contributions/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
@ -0,0 +1,122 @@
 package org.apache.lucene.search.spell;
 import junit.framework.*;
 import org.apache.lucene.search.spell.*;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.analysis.SimpleAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.util.English;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
 import java.io.IOException;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.Directory;
 import java.io.File;
 /**
 * Test case
 * @author Nicolas Maisonneuve
 */
 public class TestSpellChecker
 extends TestCase {
    private SpellChecker spellChecker;
    Directory userindex, spellindex;
    protected void setUp () throws Exception {
        super.setUp();
        //create a user index
        userindex=new RAMDirectory();
        IndexWriter writer=new IndexWriter(userindex, new SimpleAnalyzer(), true);
        for (int i=0; i<1000; i++) {
            Document doc=new Document();
            doc.add(Field.Text("field1", English.intToEnglish(i)));
            doc.add(Field.Text("field2", English.intToEnglish(i+1))); // + word thousand
            writer.addDocument(doc);
        }
        writer.close();
        // create the spellChecker
        File file=new File("d://test");
        spellindex=FSDirectory.getDirectory(file, true);
        spellChecker=new SpellChecker(spellindex);
    }
    public void testBuild () {
        try {
            IndexReader r=IndexReader.open(userindex);
            spellChecker.clearIndex();
            addwords(r, "field1");
            int num_field1=this.numdoc();
            addwords(r, "field2");
            int num_field2=this.numdoc();
            this.assertTrue(num_field2==num_field1+1);
            // test small word
            String[] l=spellChecker.suggestSimilar("fvie", 2);
            this.assertTrue(l[0].equals("five"));
            l=spellChecker.suggestSimilar("fiv", 2);
            this.assertTrue(l[0].equals("five"));
            l=spellChecker.suggestSimilar("ive", 2);
            this.assertTrue(l[0].equals("five"));
            l=spellChecker.suggestSimilar("fives", 2);
            this.assertTrue(l[0].equals("five"));
            l=spellChecker.suggestSimilar("fie", 2);
            this.assertTrue(l[0].equals("five"));
            l=spellChecker.suggestSimilar("fi", 2);
            this.assertEquals(0,l.length);
            // test restreint to a field
            l=spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
            this.assertEquals(0,l.length); // there isn't the term thousand in the field field1
            l=spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
            this.assertEquals(1,l.length); // there is the term thousand in the field field2
        }
        catch (IOException e) {
            e.printStackTrace();
            this.assertTrue(false);
        }
    }
    private void addwords (IndexReader r, String field) throws IOException {
        long time=System.currentTimeMillis();
        spellChecker.indexDictionnary(new LuceneDictionary(r, field));
        time=System.currentTimeMillis()-time;
        System.out.println("time to build "+field+": "+time);
    }
    private int numdoc () throws IOException {
        IndexReader rs=IndexReader.open(spellindex);
        int num=rs.numDocs();
        this.assertTrue(num!=0);
        System.out.println("num docs: "+num);
        rs.close();
        return num;
    }
    protected void tearDown () throws Exception {
        spellChecker=null;
        super.tearDown();
    }
 }