- Reformatted

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@424183 13f79535-47bb-0310-9956-ffa450edef68
2006-07-21 05:14:59 +00:00 · 2006-07-21 05:14:59 +00:00 · de09962750
parent 482ad148f9
commit de09962750
1 changed files with 267 additions and 279 deletions
--- a/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
+++ b/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java
@ -1,6 +1,5 @@
 package org.apache.lucene.search.spell;

-
 /**
 * Copyright 2002-2004 The Apache Software Foundation
 *
@ -17,7 +16,6 @@ package org.apache.lucene.search.spell;
 * limitations under the License.
 */

-
 import java.io.IOException;

 import org.apache.lucene.analysis.WhitespaceAnalyzer;
@ -35,14 +33,13 @@ import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
 import java.util.*;

-
 /**
- *  <p>
+ * <p>
 *   Spell Checker class  (Main class) <br/>
- *   (initially inspired by the David Spencer code).
- *  </p>
- *  
- *  <p>Example Usage:
+ *  (initially inspired by the David Spencer code).
+ * </p>
+ *
+ * <p>Example Usage:
 * 
 * <pre>
 *  SpellChecker spellchecker = new SpellChecker(spellIndexDirectory);
@ -58,303 +55,294 @@ import java.util.*;
 */
 public class SpellChecker {

-    /**
-     * Field name for each word in the ngram index.
-     */
-    public static final String F_WORD="word";
+  /**
+   * Field name for each word in the ngram index.
+   */
+  public static final String F_WORD = "word";

+  /**
+   * the spell index
+   */
+  Directory spellindex;

-    /**
-     * the spell index
-     */
-    Directory spellindex;
+  /**
+   * Boost value for start and end grams
+   */
+  private float bStart = 2.0f;

-    /**
-     * Boost value for start and end grams
-     */private float bStart=2.0f;
-      private float bEnd=1.0f;
-  
+  private float bEnd = 1.0f;

-    private IndexReader reader;
-    float min=0.5f;
+  private IndexReader reader;

-    public void setSpellIndex (Directory spellindex) {
-        this.spellindex=spellindex;
+  float min = 0.5f;
+
+  public void setSpellIndex(Directory spellindex) {
+    this.spellindex = spellindex;
+  }
+
+  /**
+   *  Set the accuracy 0 &lt; min &lt; 1; default 0.5
+   */
+  public void setAccuraty(float min) {
+    this.min = min;
+  }
+
+  public SpellChecker(Directory gramIndex) {
+    this.setSpellIndex(gramIndex);
+  }
+
+  /**
+   * Suggest similar words
+   * @param word String the word you want a spell check done on
+   * @param num_sug int the number of suggest words
+   * @throws IOException
+   * @return String[]
+   */
+  public String[] suggestSimilar(String word, int num_sug) throws IOException {
+    return this.suggestSimilar(word, num_sug, null, null, false);
+  }
+
+  /**
+   * Suggest similar words (restricted or not to a field of a user index)
+   * @param word String the word you want a spell check done on
+   * @param num_sug int the number of suggest words
+   * @param ir the indexReader of the user index (can be null see field param)
+   * @param field String the field of the user index: if field is not null, the suggested
+   * words are restricted to the words present in this field.
+   * @param morePopular boolean return only the suggest words that are more frequent than the searched word
+   * (only if restricted mode = (indexReader!=null and field!=null)
+   * @throws IOException
+   * @return String[] the sorted list of the suggest words with this 2 criteria:
+   * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
+   * of the suggest words in the field of the user index
+   */
+  public String[] suggestSimilar(String word, int num_sug, IndexReader ir,
+      String field, boolean morePopular) throws IOException {
+
+    float min = this.min;
+    final TRStringDistance sd = new TRStringDistance(word);
+    final int lengthWord = word.length();
+
+    final int goalFreq = (morePopular && ir != null) ? ir.docFreq(new Term(
+        field, word)) : 0;
+    if (!morePopular && goalFreq > 0) {
+      return new String[] { word }; // return the word if it exist in the index and i don't want a more popular word
    }

+    BooleanQuery query = new BooleanQuery();
+    String[] grams;
+    String key;
+
+    for (int ng = getMin(lengthWord); ng <= getMax(lengthWord); ng++) {
+
+      key = "gram" + ng; // form key
+
+      grams = formGrams(word, ng); // form word into ngrams (allow dups too)
+
+      if (grams.length == 0) {
+        continue; // hmm
+      }
+
+      if (bStart > 0) { // should we boost prefixes?
+        add(query, "start" + ng, grams[0], bStart); // matches start of word
+
+      }
+      if (bEnd > 0) { // should we boost suffixes
+        add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
+
+      }
+      for (int i = 0; i < grams.length; i++) {
+        add(query, key, grams[i]);
+      }

-    /**
-     *  Set the accuracy 0 &lt; min &lt; 1; default 0.5
-     */
-    public void setAccuraty (float min) {
-        this.min=min;
    }

+    IndexSearcher searcher = new IndexSearcher(this.spellindex);
+    Hits hits = searcher.search(query);
+    SuggestWordQueue sugqueue = new SuggestWordQueue(num_sug);

-    public SpellChecker (Directory gramIndex) {
-        this.setSpellIndex(gramIndex);
-    }
+    int stop = Math.min(hits.length(), 10 * num_sug); // go thru more than 'maxr' matches in case the distance filter triggers
+    SuggestWord sugword = new SuggestWord();
+    for (int i = 0; i < stop; i++) {

+      sugword.string = hits.doc(i).get(F_WORD); // get orig word)

-    /**
-     * Suggest similar words
-     * @param word String the word you want a spell check done on
-     * @param num_sug int the number of suggest words
-     * @throws IOException
-     * @return String[]
-     */
-    public String[] suggestSimilar (String word, int num_sug) throws IOException {
-        return this.suggestSimilar(word, num_sug, null, null, false);
-    }
+      if (sugword.string.equals(word)) {
+        continue; // don't suggest a word for itself, that would be silly
+      }

+      //edit distance/normalize with the min word length
+      sugword.score = 1.0f - ((float) sd.getDistance(sugword.string) / Math
+          .min(sugword.string.length(), lengthWord));
+      if (sugword.score < min) {
+        continue;
+      }

-    /**
-     * Suggest similar words (restricted or not to a field of a user index)
-     * @param word String the word you want a spell check done on
-     * @param num_sug int the number of suggest words
-     * @param ir the indexReader of the user index (can be null see field param)
-     * @param field String the field of the user index: if field is not null, the suggested
-     * words are restricted to the words present in this field.
-     * @param morePopular boolean return only the suggest words that are more frequent than the searched word
-     * (only if restricted mode = (indexReader!=null and field!=null)
-     * @throws IOException
-     * @return String[] the sorted list of the suggest words with this 2 criteria:
-     * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
-     * of the suggest words in the field of the user index
-     */
-    public String[] suggestSimilar (String word, int num_sug, IndexReader ir, String field
-    , boolean morePopular) throws IOException {
-
-        float min = this.min;
-        final TRStringDistance sd=new TRStringDistance(word);
-        final int lengthWord=word.length();
-
-        final int goalFreq=(morePopular&&ir!=null)?ir.docFreq(new Term(field, word)):0;
-        if (!morePopular&&goalFreq>0) {
-            return new String[] {
-            word}; // return the word if it exist in the index and i don't want a more popular word
+      if (ir != null) { // use the user index
+        sugword.freq = ir.docFreq(new Term(field, sugword.string)); // freq in the index
+        if ((morePopular && goalFreq > sugword.freq) || sugword.freq < 1) { // don't suggest a word that is not present in the field
+          continue;
        }
-
-        BooleanQuery query=new BooleanQuery();
-        String[] grams;
-        String key;
-
-        for (int ng=getMin(lengthWord); ng<=getMax(lengthWord); ng++) {
-
-            key="gram"+ng; // form key
-
-            grams=formGrams(word, ng); // form word into ngrams (allow dups too)
-
-            if (grams.length==0) {
-                continue; // hmm
-            }
-
-            if (bStart>0) { // should we boost prefixes?
-                add(query, "start"+ng, grams[0], bStart); // matches start of word
-
-            }
-            if (bEnd>0) { // should we boost suffixes
-                add(query, "end"+ng, grams[grams.length-1], bEnd); // matches end of word
-
-            }
-            for (int i=0; i<grams.length; i++) {
-                add(query, key, grams[i]);
-            }
-
-        }
-
-        IndexSearcher searcher=new IndexSearcher(this.spellindex);
-        Hits hits=searcher.search(query);
-        SuggestWordQueue sugqueue=new SuggestWordQueue(num_sug);
-
-        int stop=Math.min(hits.length(), 10*num_sug); // go thru more than 'maxr' matches in case the distance filter triggers
-        SuggestWord sugword=new SuggestWord();
-        for (int i=0; i<stop; i++) {
-
-            sugword.string=hits.doc(i).get(F_WORD); // get orig word)
-
-            if (sugword.string.equals(word)) {
-                continue; // don't suggest a word for itself, that would be silly
-            }
-
-            //edit distance/normalize with the min word length
-            sugword.score=1.0f-((float) sd.getDistance(sugword.string)/Math.min(sugword.string.length(), lengthWord));
-            if (sugword.score<min) {
-                continue;
-            }
-
-            if (ir!=null) { // use the user index
-                sugword.freq=ir.docFreq(new Term(field, sugword.string)); // freq in the index
-                if ((morePopular&&goalFreq>sugword.freq)||sugword.freq<1) { // don't suggest a word that is not present in the field
-                    continue;
-                }
-            }
-            sugqueue.insert(sugword);
-            if (sugqueue.size()==num_sug) {
-                //if queue full , maintain the min score
-                min=((SuggestWord) sugqueue.top()).score;
-            }
-            sugword=new SuggestWord();
-        }
-
-        // convert to array string
-        String[] list=new String[sugqueue.size()];
-        for (int i=sugqueue.size()-1; i>=0; i--) {
-            list[i]=((SuggestWord) sugqueue.pop()).string;
-        }
-
-        searcher.close();
-        return list;
+      }
+      sugqueue.insert(sugword);
+      if (sugqueue.size() == num_sug) {
+        //if queue full , maintain the min score
+        min = ((SuggestWord) sugqueue.top()).score;
+      }
+      sugword = new SuggestWord();
    }

-
-    /**
-     * Add a clause to a boolean query.
-     */
-    private static void add (BooleanQuery q, String k, String v, float boost) {
-        Query tq=new TermQuery(new Term(k, v));
-        tq.setBoost(boost);
-        q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
+    // convert to array string
+    String[] list = new String[sugqueue.size()];
+    for (int i = sugqueue.size() - 1; i >= 0; i--) {
+      list[i] = ((SuggestWord) sugqueue.pop()).string;
    }

+    searcher.close();
+    return list;
+  }

-    /**
-     * Add a clause to a boolean query.
-     */
-    private static void add (BooleanQuery q, String k, String v) {
-        q.add(new BooleanClause(new TermQuery(new Term(k, v)), BooleanClause.Occur.SHOULD));
+  /**
+   * Add a clause to a boolean query.
+   */
+  private static void add(BooleanQuery q, String k, String v, float boost) {
+    Query tq = new TermQuery(new Term(k, v));
+    tq.setBoost(boost);
+    q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
+  }
+
+  /**
+   * Add a clause to a boolean query.
+   */
+  private static void add(BooleanQuery q, String k, String v) {
+    q.add(new BooleanClause(new TermQuery(new Term(k, v)),
+        BooleanClause.Occur.SHOULD));
+  }
+
+  /**
+   * Form all ngrams for a given word.
+   * @param text the word to parse
+   * @param ng the ngram length e.g. 3
+   * @return an array of all ngrams in the word and note that duplicates are not removed
+   */
+  private static String[] formGrams(String text, int ng) {
+    int len = text.length();
+    String[] res = new String[len - ng + 1];
+    for (int i = 0; i < len - ng + 1; i++) {
+      res[i] = text.substring(i, i + ng);
    }
+    return res;
+  }

+  public void clearIndex() throws IOException {
+    IndexReader.unlock(spellindex);
+    IndexWriter writer = new IndexWriter(spellindex, null, true);
+    writer.close();
+  }

-    /**
-     * Form all ngrams for a given word.
-     * @param text the word to parse
-     * @param ng the ngram length e.g. 3
-     * @return an array of all ngrams in the word and note that duplicates are not removed
-     */
-    private static String[] formGrams (String text, int ng) {
-        int len=text.length();
-        String[] res=new String[len-ng+1];
-        for (int i=0; i<len-ng+1; i++) {
-            res[i]=text.substring(i, i+ng);
+  /**
+   * Check whether the word exists in the index.
+   * @param word String
+   * @throws IOException
+   * @return true iff the word exists in the index
+   */
+  public boolean exist(String word) throws IOException {
+    if (reader == null) {
+      reader = IndexReader.open(spellindex);
+    }
+    return reader.docFreq(new Term(F_WORD, word)) > 0;
+  }
+
+  /**
+   * Index a Dictionary
+   * @param dict the dictionary to index
+   * @throws IOException
+   */
+  public void indexDictionary(Dictionary dict) throws IOException {
+    IndexReader.unlock(spellindex);
+    IndexWriter writer = new IndexWriter(spellindex, new WhitespaceAnalyzer(),
+        !IndexReader.indexExists(spellindex));
+    writer.setMergeFactor(300);
+    writer.setMaxBufferedDocs(150);
+
+    Iterator iter = dict.getWordsIterator();
+    while (iter.hasNext()) {
+      String word = (String) iter.next();
+
+      int len = word.length();
+      if (len < 3) {
+        continue; // too short we bail but "too long" is fine...
+      }
+
+      if (this.exist(word)) { // if the word already exist in the gramindex
+        continue;
+      }
+
+      // ok index the word
+      Document doc = createDocument(word, getMin(len), getMax(len));
+      writer.addDocument(doc);
+    }
+    // close writer
+    writer.optimize();
+    writer.close();
+
+    // close reader
+    //        reader.close();
+    //        reader=null;
+  }
+
+  private int getMin(int l) {
+    if (l > 5) {
+      return 3;
+    }
+    if (l == 5) {
+      return 2;
+    }
+    return 1;
+  }
+
+  private int getMax(int l) {
+    if (l > 5) {
+      return 4;
+    }
+    if (l == 5) {
+      return 3;
+    }
+    return 2;
+  }
+
+  private static Document createDocument(String text, int ng1, int ng2) {
+    Document doc = new Document();
+    doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term
+    addGram(text, doc, ng1, ng2);
+    return doc;
+  }
+
+  private static void addGram(String text, Document doc, int ng1, int ng2) {
+    int len = text.length();
+    for (int ng = ng1; ng <= ng2; ng++) {
+      String key = "gram" + ng;
+      String end = null;
+      for (int i = 0; i < len - ng + 1; i++) {
+        String gram = text.substring(i, i + ng);
+        doc
+            .add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
+        if (i == 0) {
+          doc.add(new Field("start" + ng, gram, Field.Store.YES,
+              Field.Index.UN_TOKENIZED));
        }
-        return res;
+        end = gram;
+      }
+      if (end != null) { // may not be present if len==ng1
+        doc.add(new Field("end" + ng, end, Field.Store.YES,
+            Field.Index.UN_TOKENIZED));
+      }
    }
+  }

-
-    public void clearIndex () throws IOException {
-        IndexReader.unlock(spellindex);
-        IndexWriter writer=new IndexWriter(spellindex, null, true);
-        writer.close();
+  protected void finalize() throws Throwable {
+    if (reader != null) {
+      reader.close();
    }
-
-
-    /**
-     * Check whether the word exists in the index.
-     * @param word String
-     * @throws IOException
-     * @return true iff the word exists in the index
-     */
-    public boolean exist (String word) throws IOException {
-        if (reader==null) {
-            reader=IndexReader.open(spellindex);
-        }
-        return reader.docFreq(new Term(F_WORD, word))>0;
-    }
-
-
-    /**
-     * Index a Dictionary
-     * @param dict the dictionary to index
-     * @throws IOException
-     */
-    public void indexDictionary (Dictionary dict) throws IOException {
-        IndexReader.unlock(spellindex);
-        IndexWriter writer=new IndexWriter(spellindex, new WhitespaceAnalyzer(), !IndexReader.indexExists(spellindex));
-        writer.setMergeFactor(300);
-        writer.setMaxBufferedDocs(150);
-
-        Iterator iter=dict.getWordsIterator();
-        while (iter.hasNext()) {
-            String word=(String) iter.next();
-
-            int len=word.length();
-            if (len<3) {
-                continue; // too short we bail but "too long" is fine...
-            }
-
-            if (this.exist(word)) { // if the word already exist in the gramindex
-                continue;
-            }
-
-            // ok index the word
-            Document doc=createDocument(word, getMin(len), getMax(len));
-            writer.addDocument(doc);
-        }
-        // close writer
-        writer.optimize();
-        writer.close();
-
-        // close reader
-        reader.close();
-        reader=null;
-    }
-
-
-    private int getMin (int l) {
-        if (l>5) {
-            return 3;
-        }
-        if (l==5) {
-            return 2;
-        }
-        return 1;
-    }
-
-
-    private int getMax (int l) {
-        if (l>5) {
-            return 4;
-        }
-        if (l==5) {
-            return 3;
-        }
-        return 2;
-    }
-
-
-    private static Document createDocument (String text, int ng1, int ng2) {
-        Document doc=new Document();
-        doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term
-        addGram(text, doc, ng1, ng2);
-        return doc;
-    }
-
-
-    private static void addGram (String text, Document doc, int ng1, int ng2) {
-        int len=text.length();
-        for (int ng=ng1; ng<=ng2; ng++) {
-            String key="gram"+ng;
-            String end=null;
-            for (int i=0; i<len-ng+1; i++) {
-                String gram=text.substring(i, i+ng);
-                doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
-                if (i==0) {
-                    doc.add(new Field("start"+ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
-                }
-                end=gram;
-            }
-            if (end!=null) { // may not be present if len==ng1
-                doc.add(new Field("end"+ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED));
-            }
-        }
-    }
-
-
-    protected void finalize() throws Throwable {
-        if (reader!=null) {
-            reader.close();
-        }
-    }
-
+  }
 }