From 57cd0765650d0f73db37e090eaed4748fcd2fc82 Mon Sep 17 00:00:00 2001 From: David Spencer Date: Tue, 2 Nov 2004 23:11:29 +0000 Subject: [PATCH] prelim checking of spellchecker, v1.1 git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@151014 13f79535-47bb-0310-9956-ffa450edef68 --- sandbox/contributions/spellchecker/build.xml | 156 ++++++++ .../lucene/search/spell/Dictionary.java | 33 ++ .../lucene/search/spell/LuceneDictionary.java | 94 +++++ .../search/spell/PlainTextDictionary.java | 86 +++++ .../lucene/search/spell/SpellChecker.java | 363 ++++++++++++++++++ .../lucene/search/spell/SuggestWord.java | 64 +++ .../lucene/search/spell/SuggestWordQueue.java | 41 ++ .../lucene/search/spell/TRStringDistance.java | 132 +++++++ .../lucene/search/spell/TestSpellChecker.java | 122 ++++++ 9 files changed, 1091 insertions(+) create mode 100755 sandbox/contributions/spellchecker/build.xml create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java create mode 100755 sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/TRStringDistance.java create mode 100755 sandbox/contributions/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java diff --git a/sandbox/contributions/spellchecker/build.xml b/sandbox/contributions/spellchecker/build.xml new file mode 100755 index 00000000000..d6f8917e16a --- /dev/null +++ 
b/sandbox/contributions/spellchecker/build.xml @@ -0,0 +1,156 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ################################################################## + JUnit not found. + Please make sure junit.jar is in ANT_HOME/lib, or made available + to Ant using other mechanisms like -lib or CLASSPATH. + ################################################################## + + + + + + + + + + + + + + + + + Tests failed! + + + + + + + \ No newline at end of file diff --git a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java new file mode 100755 index 00000000000..979621abdeb --- /dev/null +++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/Dictionary.java @@ -0,0 +1,33 @@ +package org.apache.lucene.search.spell; +/** + * Copyright 2002-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Iterator; + +/** + * A simple interface representing a dictionary: a source of words that can be iterated over. + * @author Nicolas Maisonneuve + * @version 1.0 + */ +public interface Dictionary { + + /** + * Returns an iterator over all the words present in the dictionary. + * @return an Iterator over the dictionary's words (SpellChecker casts each element to String) + */ + public Iterator getWordsIterator(); + +} diff --git a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java new file mode 100755 index 00000000000..d94cedbd5e9 --- /dev/null +++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/LuceneDictionary.java @@ -0,0 +1,94 @@ +package org.apache.lucene.search.spell; + +/** + * Copyright 2002-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.index.IndexReader; +import java.util.Iterator; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.Term; +import java.io.*; + +/** + * Dictionary whose words are the terms of a single field of a Lucene index. + * @author Nicolas Maisonneuve + */ +public class LuceneDictionary +implements Dictionary { + IndexReader reader; + String field; + + public LuceneDictionary (IndexReader reader, String field) { + this.reader=reader; + this.field=field; + + } + + + public final Iterator getWordsIterator () { + return new LuceneIterator(); + } + + +final class LuceneIterator implements Iterator { + private TermEnum termEnum; + private Term actualTerm; + private boolean has_next_called; + + public LuceneIterator () { + try { + termEnum=reader.terms(new Term(field, "")); + } + catch (IOException ex) { + ex.printStackTrace(); + } + } + + + public Object next () { + if (!has_next_called) {hasNext();} + has_next_called=false; + return (actualTerm!=null) ? actualTerm.text(): null; + } + + + public boolean hasNext () { + if (has_next_called) {return actualTerm!=null;} // idempotent until next() is called + has_next_called=true; + actualTerm=null; // cleared up front so every failure path reports no more words + try { + // if there are still words + if (!termEnum.next()) { + return false; + } + // if the next word is in the field + Term term=termEnum.term(); + if (!term.field().equals(field)) { // compare by value, not by String identity + return false; + } + actualTerm=term; + return true; + } + catch (IOException ex) { + ex.printStackTrace(); + return false; + } + } + + + public void remove () {}; + } +} diff --git a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java new file mode 100755 index 00000000000..230b923744e --- /dev/null +++ b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/PlainTextDictionary.java @@ -0,0 +1,86 @@ +package org.apache.lucene.search.spell; + +/** + * Copyright 2002-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.util.Iterator; +import java.io.InputStream; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.io.*; + + +/** + * Dictionary backed by a plain text source (a file or an input stream). + * Format allowed: 1 word per line: + * word1 + * word2 + * word3 + * + * @author Nicolas Maisonneuve + */ +public class PlainTextDictionary implements Dictionary { + + private BufferedReader in; + private String line; + private boolean has_next_called; + + public PlainTextDictionary (File file) throws FileNotFoundException { + in=new BufferedReader(new FileReader(file)); + } + + + public PlainTextDictionary (InputStream dictFile) { + in=new BufferedReader(new InputStreamReader(dictFile)); + } + + + public Iterator getWordsIterator () { + + return new fileIterator(); + } + + + final class fileIterator + implements Iterator { + public Object next () { + if (!has_next_called) { + hasNext(); + } + has_next_called=false; + return line; + } + + + public boolean hasNext () { + if (has_next_called) {return line!=null;} // idempotent until next() is called + has_next_called=true; + try { + line=in.readLine(); + } + catch (IOException ex) { + ex.printStackTrace(); + line=null; + } + return line!=null; + } + + + public void remove () {}; + } + +} diff --git a/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java new file mode 100755 index 00000000000..93be0a6a4fd --- /dev/null +++ 
b/sandbox/contributions/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java @@ -0,0 +1,363 @@ +package org.apache.lucene.search.spell; + + +/** + * Copyright 2002-2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.io.IOException; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import java.util.*; + + +/** + *

+ * Spell Checker class (Main class)
+ * (initially inspired by the David Spencer code) + *

+ * + *

+ * SpellChecker spellchecker = new SpellChecker(spellDirectory);
+ *
+ * //To index a field of a user index
+ * spellchecker.indexDictionnary(new LuceneDictionary(my_lucene_reader, a_field));
+ *
+ * //To index a file containing words
+ * spellchecker.indexDictionnary(new PlainTextDictionary(new File("myfile.txt")));
+ *

+ * + * @author Nicolas Maisonneuve + * @version 1.0 + */ +public class SpellChecker { + + /** + * Field name for each word in the ngram index. + */ + public static final String F_WORD="word"; + + + /** + * the spell index + */ + Directory spellindex; + + /** + * Boost value for start and end grams + */private float bStart=2.0f; + private float bEnd=1.0f; + + + private IndexReader reader; + float min=0.5f; + + public void setSpellIndex (Directory spellindex) { + this.spellindex=spellindex; + } + + + /** + * Set the accuraty 00) { + return new String[] { + word}; // return the word if it exist in the index and i don't want a more popular word + } + + BooleanQuery query=new BooleanQuery(); + String[] grams; + String key; + + for (int ng=getMin(lengthWord); ng<=getMax(lengthWord); ng++) { + + key="gram"+ng; // form key + + grams=formGrams(word, ng); // form word into ngrams (allow dups too) + + if (grams.length==0) { + continue; // hmm + } + + if (bStart>0) { // should we boost prefixes? + add(query, "start"+ng, grams[0], bStart); // matches start of word + + } + if (bEnd>0) { // should we boost suffixes + add(query, "end"+ng, grams[grams.length-1], bEnd); // matches end of word + + } + for (int i=0; isugword.freq)||sugword.freq<1) { // don't suggest a word that is not present in the field + continue; + } + } + sugqueue.insert(sugword); + if (sugqueue.size()==num_sug) { + //if queue full , maintain the min score + min=((SuggestWord) sugqueue.top()).score; + } + sugword=new SuggestWord(); + } + + // convert to array string + String[] list=new String[sugqueue.size()]; + for (int i=sugqueue.size()-1; i>=0; i--) { + list[i]=((SuggestWord) sugqueue.pop()).string; + } + + searcher.close(); + return list; + } + + + /** + * Add a clause to a boolean query. 
+ */ + private static void add (BooleanQuery q, String k, String v, float boost) { + Query tq=new TermQuery(new Term(k, v)); + tq.setBoost(boost); + q.add(new BooleanClause(tq, false, false)); + } + + + /** + * Add a clause to a boolean query. + */ + private static void add (BooleanQuery q, String k, String v) { + q.add(new BooleanClause(new TermQuery(new Term(k, v)), false, false)); + } + + + /** + * Form all ngrams for a given word. + * @param text the word to parse + * @param ng the ngram length e.g. 3 + * @return an array of all ngrams in the word and note that duplicates are not removed + */ + private static String[] formGrams (String text, int ng) { + int len=text.length(); + String[] res=new String[len-ng+1]; + for (int i=0; i0; + } + + + /** + * Index a Dictionnary + * @param dict the dictionnary to index + * @throws IOException + */ + public void indexDictionnary (Dictionary dict) throws IOException { + + int ng1, ng2; + IndexReader.unlock(spellindex); + IndexWriter writer=new IndexWriter(spellindex, new WhitespaceAnalyzer(), !IndexReader.indexExists(spellindex)); + writer.mergeFactor=300; + writer.minMergeDocs=150; + + Iterator iter=dict.getWordsIterator(); + while (iter.hasNext()) { + String word=(String) iter.next(); + + int len=word.length(); + if (len<3) { + continue; // too short we bail but "too long" is fine... 
+ } + + if (this.exist(word)) { // if the word already exist in the gramindex + continue; + } + + // ok index the word + Document doc=createDocument(word, getMin(len), getMax(len)); + writer.addDocument(doc); + } + // close writer + writer.optimize(); + writer.close(); + + // close reader + reader.close(); + reader=null; + } + + + private int getMin (int l) { + if (l>5) { + return 3; + } + if (l==5) { + return 2; + } + return 1; + } + + + private int getMax (int l) { + if (l>5) { + return 4; + } + if (l==5) { + return 3; + } + return 2; + + } + + + private static Document createDocument (String text, int ng1, int ng2) { + Document doc=new Document(); + doc.add(Field.Keyword(F_WORD, text)); // orig term + addGram(text, doc, ng1, ng2); + return doc; + } + + + private static void addGram (String text, Document doc, int ng1, int ng2) { + int len=text.length(); + for (int ng=ng1; ng<=ng2; ng++) { + String key="gram"+ng; + String end=null; + for (int i=0; ia.score) { + return 1; + } + if (scorea.freq) { + return 1; + } + + if (freq=cache.length) { + d=form(n, m); + } + else if (cache[m]!=null) { + d=cache[m]; + } + else { + d=cache[m]=form(n, m); + + // Step 3 + + } + for (int i=1; i<=n; i++) { + final char s_i=sa[i-1]; + + // Step 4 + + for (int j=1; j<=m; j++) { + final char t_j=ta[j-1]; + + // Step 5 + + if (s_i==t_j) { // same + cost=0; + } + else { // not a match + cost=1; + + // Step 6 + + } + d[i][j]=min3(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+cost); + + } + + } + + // Step 7 + return d[n][m]; + + } + + + /** + * + */ + private static int[][] form (int n, int m) { + int[][] d=new int[n+1][m+1]; + // Step 2 + + for (int i=0; i<=n; i++) { + d[i][0]=i; + + } + for (int j=0; j<=m; j++) { + d[0][j]=j; + } + return d; + } + + + //**************************** + // Get minimum of three values + //**************************** + private static int min3 (int a, int b, int c) { + int mi=a; + if (b