prelim checking of spellchecker, v1.1

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@151014 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
David Spencer 2004-11-02 23:11:29 +00:00
parent a8d98638a0
commit 57cd076565
9 changed files with 1091 additions and 0 deletions

View File

@ -0,0 +1,156 @@
<?xml version="1.0" encoding="UTF-8"?>
<project basedir="." default="rebuild" name="Spelling checker">
<!-- External inputs. Both defaults are absolute Windows paths; point lucene.lib
     and lucenetest.lib at a local Lucene jar / Lucene test classes (for example
     with -Dlucene.lib=... on the Ant command line) when building elsewhere. -->
<property name="lucene.lib" value="d:/dev/lib/lucene.jar"/>
<property name="lucenetest.lib" value="D:/dev/jakarta-lucene/build/classes/test"/>

<!-- Project identity; final.name is the base name of the generated jar. -->
<property name="name" value="spellchecker"/>
<property name="Name" value="spellchecker"/>
<property name="version" value="1.1"/>
<property name="year" value="2004"/>
<property name="final.name" value="${name}-${version}"/>

<!-- Source layout -->
<property name="java" location="src/java"/>
<property name="test" location="src/test"/>

<!-- Output layout -->
<property name="build.dir" location="build"/>
<property name="build.java" location="${build.dir}/classes/java"/>
<property name="build.test" location="${build.dir}/classes/test"/>
<property name="build.javadocs" location="doc"/>
<property name="junit.output.dir" location="${build.dir}/test"/>
<property name="junit.reports" location="${build.dir}/test/reports"/>

<!-- Tool settings -->
<property name="javadoc.link" value="http://java.sun.com/j2se/1.4/docs/api/"/>
<property name="javac.debug" value="off"/>

<!-- Build classpath: Lucene plus the compiled core classes. -->
<path id="classpath">
  <pathelement location="${lucene.lib}"/>
  <pathelement location="${build.java}"/>
</path>

<!-- Test classpath: the build classpath, Lucene's own test classes, and the
     compiled tests. The last entry uses build.test (the directory the
     compile-test target writes to) so the two cannot drift apart. -->
<path id="test.classpath">
  <path refid="classpath"/>
  <pathelement location="${lucenetest.lib}"/>
  <pathelement location="${build.test}"/>
</path>
<!--Patternset to exclude files from the output directory:-->
<!-- ================================================================== -->
<!-- C O M P I L E -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
<target name="javacompile" description="Compiles core classes">
  <!-- Make sure the output directory exists before compiling into it. -->
  <mkdir dir="${build.java}"/>
  <!-- Compile every source file under src/java against the build classpath. -->
  <javac destdir="${build.java}"
         srcdir="${java}"
         includes="**/*.java"
         optimize="on"
         debug="${javac.debug}"
         classpathref="classpath"/>
</target>
<!-- ================================================================== -->
<!-- J A R -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
<target name="jar" description="Generates the Jar file" depends="javacompile">
  <!-- Package the compiled core classes as ${final.name}.jar in the build directory. -->
  <jar basedir="${build.java}"
       destfile="${build.dir}/${final.name}.jar"/>
</target>
<!-- ================================================================== -->
<!-- J A V A D O C -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
<target name="javadoc">
  <mkdir dir="${build.javadocs}"/>
  <!-- classpathref lets javadoc resolve the Lucene classes the sources import;
       the overview file is located through the java property like the sources. -->
  <javadoc
      sourcepath="${java}"
      classpathref="classpath"
      overview="${java}/overview.html"
      packagenames="org.apache.lucene.*"
      destdir="${build.javadocs}"
      author="true"
      version="true"
      use="true"
      link="${javadoc.link}"
      windowtitle="${Name} ${version} API"
      doctitle="${Name} ${version} API"
      bottom="Author: Nicolas Maisonneuve (${year})">
  </javadoc>
</target>
<!-- ================================================================== -->
<!-- C L E A N -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
<target name="clean">
  <!-- Remove everything under the build directory; failonerror is off so a
       build directory that does not exist yet is not treated as an error. -->
  <delete includeemptydirs="true" failonerror="false">
    <fileset dir="${build.dir}"/>
  </delete>
</target>
<!-- ================================================================== -->
<!-- B U I L D T E S T -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
<target name="compile-test" depends="javacompile">
  <!-- Make sure the test output directory exists before compiling into it. -->
  <mkdir dir="${build.test}"/>
  <!-- Tests are always compiled with debug information. -->
  <javac destdir="${build.test}"
         srcdir="${test}"
         includes="**/*.java"
         debug="true"
         classpathref="test.classpath"/>
</target>
<!-- ================================================================== -->
<!-- R U N T E S T S -->
<!-- ================================================================== -->
<!-- -->
<!-- ================================================================== -->
<target name="test" depends="compile-test" description="Runs unit tests">
  <!-- Probe for JUnit on Ant's classpath. Nothing else in this build file sets
       junit.present, so without this probe the guard below would always fire. -->
  <available property="junit.present" classname="junit.framework.TestCase"/>
  <fail unless="junit.present">
##################################################################
JUnit not found.
Please make sure junit.jar is in ANT_HOME/lib, or made available
to Ant using other mechanisms like -lib or CLASSPATH.
##################################################################
  </fail>
  <mkdir dir="${junit.output.dir}"/>
  <!-- Failures do not halt the run; they set tests.failed, which is checked
       after all tests have been executed. -->
  <junit printsummary="off" haltonfailure="no"
         errorProperty="tests.failed" failureProperty="tests.failed">
    <!-- test.classpath is the path defined in this build file for running tests. -->
    <classpath refid="test.classpath"/>
    <sysproperty key="dataDir" file="${test}"/>
    <sysproperty key="tempDir" file="${build.dir}/test"/>
    <formatter type="xml"/>
    <formatter type="brief" usefile="false"/>
    <!-- Default: run every Test*.java class. -->
    <batchtest fork="yes" todir="${junit.output.dir}" unless="testcase">
      <fileset dir="${test}" includes="**/Test*.java"/>
    </batchtest>
    <!-- With -Dtestcase=Name: run only that test class. -->
    <batchtest fork="yes" todir="${junit.output.dir}" if="testcase">
      <fileset dir="${test}" includes="**/${testcase}.java"/>
    </batchtest>
  </junit>
  <fail if="tests.failed">Tests failed!</fail>
</target>
<!-- make: alias for compiling the core classes. -->
<target name="make" depends="javacompile"/>
<!-- rebuild (the project default): clean, then compile from scratch. -->
<target name="rebuild" depends="clean,make"/>
</project>

View File

@ -0,0 +1,33 @@
package org.apache.lucene.search.spell;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Iterator;
/**
 * A simple interface representing a dictionary, i.e. a source of words.
 *
 * @author Nicolas Maisonneuve
 * @version 1.0
 */
public interface Dictionary {

  /**
   * Returns an iterator over all the words present in the dictionary.
   *
   * @return an Iterator over the words of this dictionary
   */
  Iterator getWordsIterator();
}

View File

@ -0,0 +1,94 @@
package org.apache.lucene.search.spell;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import java.util.Iterator;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.Term;
import java.io.*;
/**
* Lucene Dictionnary
* @author Nicolas Maisonneuve
*/
public class LuceneDictionary
implements Dictionary {
IndexReader reader;
String field;
public LuceneDictionary (IndexReader reader, String field) {
this.reader=reader;
this.field=field;
}
public final Iterator getWordsIterator () {
return new LuceneIterator();
}
final class LuceneIterator implements Iterator {
private TermEnum enum;
private Term actualTerm;
private boolean has_next_called;
public LuceneIterator () {
try {
enum=reader.terms(new Term(field, ""));
}
catch (IOException ex) {
ex.printStackTrace();
}
}
public Object next () {
if (!has_next_called) {hasNext();}
has_next_called=false;
return (actualTerm!=null) ? actualTerm.text(): null;
}
public boolean hasNext () {
has_next_called=true;
try {
// if there is still words
if (!enum.next()) {
actualTerm=null;
return false;
}
// if the next word are in the field
actualTerm=enum.term();
String fieldt=actualTerm.field();
if (fieldt!=field) {
actualTerm=null;
return false;
}
return true;
}
catch (IOException ex) {
ex.printStackTrace();
return false;
}
}
public void remove () {};
}
}

View File

@ -0,0 +1,86 @@
package org.apache.lucene.search.spell;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Iterator;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.*;
/**
 * Dictionary represented by a plain text source.
 * Format allowed: 1 word per line:
 * word1
 * word2
 * word3
 *
 * The underlying reader and the look-ahead line are held by the dictionary
 * itself, so every iterator returned by getWordsIterator() shares the same
 * read position: the words can be read through only once.
 *
 * @author Nicolas Maisonneuve
 */
public class PlainTextDictionary implements Dictionary {

  private BufferedReader in;
  private String line;
  private boolean has_next_called;

  /**
   * @param file the text file to read the words from
   * @throws FileNotFoundException if the file cannot be opened
   */
  public PlainTextDictionary (File file) throws FileNotFoundException {
    in=new BufferedReader(new FileReader(file));
  }

  /**
   * @param dictFile the stream to read the words from
   */
  public PlainTextDictionary (InputStream dictFile) {
    // read the stream that was passed in, not System.in
    in=new BufferedReader(new InputStreamReader(dictFile));
  }

  /**
   * @return Iterator yielding one String per line of the source
   */
  public Iterator getWordsIterator () {
    return new fileIterator();
  }

  final class fileIterator
      implements Iterator {

    /**
     * Returns the next line, or null once the source is exhausted.
     */
    public Object next () {
      if (!has_next_called) {
        hasNext();
      }
      has_next_called=false;
      return line;
    }

    /**
     * Tells whether another line is available. The line is read ahead only
     * once per call to next(), so repeated calls do not consume input.
     * A read error is reported on stderr and ends the iteration.
     */
    public boolean hasNext () {
      if (has_next_called) {
        // a line was already read ahead and not consumed yet
        return line!=null;
      }
      has_next_called=true;
      try {
        line=in.readLine();
      }
      catch (IOException ex) {
        ex.printStackTrace();
        line=null;
        return false;
      }
      return line!=null;
    }

    /** No-op: lines cannot be removed from the source. */
    public void remove () {}
  }
}

View File

@ -0,0 +1,363 @@
package org.apache.lucene.search.spell;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import java.util.*;
/**
 * <p>
 * Spell Checker class (Main class) <br/>
 * (initially inspired by the David Spencer code)
 * </p>
 *
 * <p>
 * SpellChecker spellchecker= new SpellChecker (spellDirectory);<br/>
 * <br/>
 * //To index a field of a user index <br/>
 * spellchecker.indexDictionnary(new LuceneDictionary(my_lucene_reader, a_field));<br/>
 * <br/>
 * //To index a file containing words <br/>
 * spellchecker.indexDictionnary(new PlainTextDictionary(new File("myfile.txt")));<br/>
 * </p>
 *
 * @author Nicolas Maisonneuve
 * @version 1.0
 */
public class SpellChecker {

  /**
   * Field name for each word in the ngram index.
   */
  public static final String F_WORD="word";

  /**
   * the spell index
   */
  Directory spellindex;

  /**
   * Boost value for start grams
   */
  private float bStart=2.0f;

  /**
   * Boost value for end grams
   */
  private float bEnd=1.0f;

  /**
   * Reader on the spell index, opened lazily by exist() and closed by
   * clearIndex(), indexDictionnary() and finalize().
   */
  private IndexReader reader;

  /**
   * Minimum score a suggestion must reach (see setAccuraty).
   */
  float min=0.5f;

  public void setSpellIndex (Directory spellindex) {
    this.spellindex=spellindex;
  }

  /**
   * Set the accuracy 0&lt;min&lt;1, default 0.5
   * @param min float
   */
  public void setAccuraty (float min) {
    this.min=min;
  }

  public SpellChecker (Directory gramIndex) {
    this.setSpellIndex(gramIndex);
  }

  /**
   * Suggest similar words
   * @param word String the word you want a spell check done on
   * @param num_sug int the number of suggest words
   * @throws IOException
   * @return String[]
   */
  public String[] suggestSimilar (String word, int num_sug) throws IOException {
    return this.suggestSimilar(word, num_sug, null, null, false);
  }

  /**
   * Suggest similar words (restricted or not to a field of a user index)
   * @param word String the word you want a spell check done on
   * @param num_sug int the number of suggest words
   * @param ir the indexReader of the user index (can be null see field param)
   * @param field String the field of the user index: if field is not null, the suggest
   * words are restricted to the words present in this field.
   * @param morePopular boolean return only the suggest words that are more frequent than the searched word
   * (only if restricted mode = (indexReader!=null and field!=null)
   * @throws IOException
   * @return String[] the sorted list of the suggest words with these 2 criteria:
   * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
   * of the suggest words in the field of the user index. In restricted mode, when morePopular
   * is false and the word itself is present in the field, the word alone is returned.
   * An empty word yields an empty array.
   */
  public String[] suggestSimilar (String word, int num_sug, IndexReader ir, String field
      , boolean morePopular) throws IOException {
    final int lengthWord=word.length();
    if (lengthWord==0) {
      return new String[0]; // nothing to form grams from, and the score would divide by zero
    }

    // per-call threshold: the configured accuracy must not be altered by a search
    float threshold=this.min;
    final TRStringDistance sd=new TRStringDistance(word);
    final boolean restricted=(ir!=null&&field!=null);
    final int freq=restricted?ir.docFreq(new Term(field, word)):0;
    final int goalFreq=morePopular?freq:0;
    if (!morePopular&&freq>0) {
      return new String[] {
          word}; // return the word if it exist in the index and i don't want a more popular word
    }

    BooleanQuery query=new BooleanQuery();
    for (int ng=getMin(lengthWord); ng<=getMax(lengthWord); ng++) {
      String key="gram"+ng; // form key
      String[] grams=formGrams(word, ng); // form word into ngrams (allow dups too)
      if (grams.length==0) {
        continue; // word shorter than the gram length
      }
      if (bStart>0) { // should we boost prefixes?
        add(query, "start"+ng, grams[0], bStart); // matches start of word
      }
      if (bEnd>0) { // should we boost suffixes
        add(query, "end"+ng, grams[grams.length-1], bEnd); // matches end of word
      }
      for (int i=0; i<grams.length; i++) {
        add(query, key, grams[i]);
      }
    }

    IndexSearcher searcher=new IndexSearcher(this.spellindex);
    try {
      Hits hits=searcher.search(query);
      SuggestWordQueue sugqueue=new SuggestWordQueue(num_sug);
      // go thru more than 'num_sug' matches in case the distance filter triggers
      int stop=Math.min(hits.length(), 10*num_sug);
      SuggestWord sugword=new SuggestWord();
      for (int i=0; i<stop; i++) {
        sugword.string=hits.doc(i).get(F_WORD); // get orig word
        if (sugword.string.equals(word)) {
          continue; // don't suggest a word for itself, that would be silly
        }
        //edit distance/normalize with the min word length
        sugword.score=1.0f-((float) sd.getDistance(sugword.string)/Math.min(sugword.string.length(), lengthWord));
        if (sugword.score<threshold) {
          continue;
        }
        if (restricted) { // use the user index
          sugword.freq=ir.docFreq(new Term(field, sugword.string)); // freq in the index
          if ((morePopular&&goalFreq>sugword.freq)||sugword.freq<1) { // don't suggest a word that is not present in the field
            continue;
          }
        }
        sugqueue.insert(sugword);
        if (sugqueue.size()==num_sug) {
          //if queue full, maintain the min score
          threshold=((SuggestWord) sugqueue.top()).score;
        }
        sugword=new SuggestWord();
      }

      // convert to array string
      String[] list=new String[sugqueue.size()];
      for (int i=sugqueue.size()-1; i>=0; i--) {
        list[i]=((SuggestWord) sugqueue.pop()).string;
      }
      return list;
    }
    finally {
      searcher.close(); // also released when the search fails
    }
  }

  /**
   * Add a boosted optional clause to a boolean query.
   */
  private static void add (BooleanQuery q, String k, String v, float boost) {
    Query tq=new TermQuery(new Term(k, v));
    tq.setBoost(boost);
    q.add(new BooleanClause(tq, false, false));
  }

  /**
   * Add an optional clause to a boolean query.
   */
  private static void add (BooleanQuery q, String k, String v) {
    q.add(new BooleanClause(new TermQuery(new Term(k, v)), false, false));
  }

  /**
   * Form all ngrams for a given word.
   * @param text the word to parse
   * @param ng the ngram length e.g. 3
   * @return an array of all ngrams in the word and note that duplicates are not removed;
   * empty when the word is shorter than ng
   */
  private static String[] formGrams (String text, int ng) {
    int count=Math.max(0, text.length()-ng+1); // never a negative array size
    String[] res=new String[count];
    for (int i=0; i<count; i++) {
      res[i]=text.substring(i, i+ng);
    }
    return res;
  }

  /**
   * Replace the spell index by an empty one.
   * @throws IOException
   */
  public void clearIndex () throws IOException {
    closeReader(); // a reader opened by exist() would still see the old content
    IndexReader.unlock(spellindex);
    IndexWriter writer=new IndexWriter(spellindex, null, true);
    writer.close();
  }

  /**
   * if the word exist in the index
   * @param word String
   * @throws IOException
   * @return boolean
   */
  public boolean exist (String word) throws IOException {
    if (reader==null) {
      reader=IndexReader.open(spellindex);
    }
    return reader.docFreq(new Term(F_WORD, word))>0;
  }

  /**
   * Index a Dictionary. Words shorter than 3 characters and words already
   * present in the spell index are skipped.
   * @param dict the dictionary to index
   * @throws IOException
   */
  public void indexDictionnary (Dictionary dict) throws IOException {
    IndexReader.unlock(spellindex);
    IndexWriter writer=new IndexWriter(spellindex, new WhitespaceAnalyzer(), !IndexReader.indexExists(spellindex));
    try {
      writer.mergeFactor=300;
      writer.minMergeDocs=150;
      Iterator iter=dict.getWordsIterator();
      while (iter.hasNext()) {
        String word=(String) iter.next();
        int len=word.length();
        if (len<3) {
          continue; // too short we bail but "too long" is fine...
        }
        if (this.exist(word)) { // if the word already exist in the gramindex
          continue;
        }
        // ok index the word
        Document doc=createDocument(word, getMin(len), getMax(len));
        writer.addDocument(doc);
      }
      writer.optimize();
    }
    finally {
      try {
        writer.close();
      }
      finally {
        // the reader is only opened by exist(); it may never have been
        closeReader();
      }
    }
  }

  /**
   * Smallest gram length used for a word of length l.
   */
  private int getMin (int l) {
    if (l>5) {
      return 3;
    }
    if (l==5) {
      return 2;
    }
    return 1;
  }

  /**
   * Largest gram length used for a word of length l.
   */
  private int getMax (int l) {
    if (l>5) {
      return 4;
    }
    if (l==5) {
      return 3;
    }
    return 2;
  }

  /**
   * Build the index document of a word: the word itself plus its grams of
   * length ng1 to ng2.
   */
  private static Document createDocument (String text, int ng1, int ng2) {
    Document doc=new Document();
    doc.add(Field.Keyword(F_WORD, text)); // orig term
    addGram(text, doc, ng1, ng2);
    return doc;
  }

  /**
   * Add to the document, for every gram length ng from ng1 to ng2, one
   * "gram"+ng field per gram and the first and last gram as "start"+ng and
   * "end"+ng.
   */
  private static void addGram (String text, Document doc, int ng1, int ng2) {
    int len=text.length();
    for (int ng=ng1; ng<=ng2; ng++) {
      String key="gram"+ng;
      String end=null;
      for (int i=0; i<len-ng+1; i++) {
        String gram=text.substring(i, i+ng);
        doc.add(Field.Keyword(key, gram));
        if (i==0) {
          doc.add(Field.Keyword("start"+ng, gram));
        }
        end=gram;
      }
      if (end!=null) { // not present when the word is shorter than ng
        doc.add(Field.Keyword("end"+ng, end));
      }
    }
  }

  /**
   * Close the cached reader on the spell index, if any.
   */
  private void closeReader () throws IOException {
    if (reader!=null) {
      try {
        reader.close();
      }
      finally {
        reader=null;
      }
    }
  }

  protected void finalize () throws Throwable {
    try {
      closeReader();
    }
    finally {
      super.finalize();
    }
  }
}

View File

@ -0,0 +1,64 @@
package org.apache.lucene.search.spell;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* SuggestWord Class
* used in suggestSimilat method in SpellChecker class
* @author Nicolas Maisonneuve
*/
final class SuggestWord {
/**
* the score of the word
*/
public float score;
/**
* The freq of the word
*/
public int freq;
/**
* the suggested word
*/
public String string;
public final int compareTo (SuggestWord a) {
//first criteria: the edit distance
if (score>a.score) {
return 1;
}
if (score<a.score) {
return-1;
}
//second criteria (if first criteria is equal): the popularity
if (freq>a.freq) {
return 1;
}
if (freq<a.freq) {
return-1;
}
return 0;
}
}

View File

@ -0,0 +1,41 @@
package org.apache.lucene.search.spell;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.PriorityQueue;
/**
 * Priority queue used to sort SuggestWord instances; the ordering is the one
 * defined by SuggestWord.compareTo.
 *
 * @author Nicolas Maisonneuve
 */
final class SuggestWordQueue
    extends PriorityQueue {

  /**
   * @param size the size handed to the priority queue initialization
   */
  SuggestWordQueue (int size) {
    initialize(size);
  }

  /**
   * @return true when a compares lower than b according to SuggestWord.compareTo
   */
  protected final boolean lessThan (Object a, Object b) {
    return ((SuggestWord) a).compareTo((SuggestWord) b)<0;
  }
}

View File

@ -0,0 +1,132 @@
package org.apache.lucene.search.spell;
/**
* Copyright 2002-2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Edit distance class: computes the Levenshtein distance between a fixed
 * target string and any other string.
 */
public final class TRStringDistance {

  /** characters of the target string */
  final char[] sa;
  /** length of the target string */
  final int n;
  /** work matrices already allocated, indexed by the length of the other string (below 30) */
  final int[][][] cache=new int[30][][];

  /**
   * Binds the instance to one target so that its characters and the work
   * matrices are set up once and reused across calls. The original author
   * reported one benchmark at 5.3sec with this constructor-based form against
   * 8.5sec with a static getDistance() variant.
   *
   * @param target the string every other string is compared to
   */
  public TRStringDistance (String target) {
    sa=target.toCharArray();
    n=sa.length;
  }

  /**
   * Compute the Levenshtein distance to the target.
   *
   * @param other the string to compare with the target
   * @return the number of single character insertions, deletions and
   * substitutions separating the two strings
   */
  public final int getDistance (String other) {
    final char[] ta=other.toCharArray();
    final int m=ta.length;
    // an empty side: the distance is the length of the other side
    if (n==0) {
      return m;
    }
    if (m==0) {
      return n;
    }

    final int[][] d=matrixFor(m);
    for (int row=1; row<=n; row++) {
      final char sourceChar=sa[row-1];
      final int[] above=d[row-1];
      final int[] current=d[row];
      for (int col=1; col<=m; col++) {
        final int substitution=above[col-1]+((sourceChar==ta[col-1])?0:1);
        final int cheapestGap=Math.min(above[col]+1, current[col-1]+1);
        current[col]=Math.min(cheapestGap, substitution);
      }
    }
    return d[n][m];
  }

  /**
   * Returns the work matrix for an other string of length m: taken from the
   * cache (filling it on first use) when m is below the cache size, freshly
   * allocated otherwise. Reuse is possible because the borders never change
   * and every inner cell is rewritten by each computation.
   */
  private int[][] matrixFor (int m) {
    if (m>=cache.length) {
      return form(n, m);
    }
    if (cache[m]==null) {
      cache[m]=form(n, m);
    }
    return cache[m];
  }

  /**
   * Allocates a (n+1) x (m+1) matrix whose first column holds 0..n and whose
   * first row holds 0..m.
   */
  private static int[][] form (int n, int m) {
    final int[][] matrix=new int[n+1][m+1];
    for (int col=0; col<=m; col++) {
      matrix[0][col]=col;
    }
    for (int row=0; row<=n; row++) {
      matrix[row][0]=row;
    }
    return matrix;
  }
}

View File

@ -0,0 +1,122 @@
package org.apache.lucene.search.spell;
import junit.framework.*;
import org.apache.lucene.search.spell.*;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.util.English;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import java.io.File;
/**
 * Test case for SpellChecker: builds a spell index from two fields of a small
 * user index and checks the suggestions returned for misspelled words.
 *
 * @author Nicolas Maisonneuve
 */
public class TestSpellChecker
    extends TestCase {

  private SpellChecker spellChecker;
  Directory userindex, spellindex;

  protected void setUp () throws Exception {
    super.setUp();

    //create a user index
    userindex=new RAMDirectory();
    IndexWriter writer=new IndexWriter(userindex, new SimpleAnalyzer(), true);
    for (int i=0; i<1000; i++) {
      Document doc=new Document();
      doc.add(Field.Text("field1", English.intToEnglish(i)));
      doc.add(Field.Text("field2", English.intToEnglish(i+1))); // + word thousand
      writer.addDocument(doc);
    }
    writer.close();

    // create the spellChecker on an in-memory index so that the test does not
    // depend on a machine-specific path and leaves nothing behind on disk
    spellindex=new RAMDirectory();
    spellChecker=new SpellChecker(spellindex);
  }

  public void testBuild () throws IOException {
    IndexReader r=IndexReader.open(userindex);
    try {
      spellChecker.clearIndex();

      addwords(r, "field1");
      int num_field1=this.numdoc();

      addwords(r, "field2");
      int num_field2=this.numdoc();

      // field2 only brings one new word (thousand)
      assertEquals(num_field1+1, num_field2);

      // test small word
      assertFirstSuggestion("five", "fvie");
      assertFirstSuggestion("five", "fiv");
      assertFirstSuggestion("five", "ive");
      assertFirstSuggestion("five", "fives");
      assertFirstSuggestion("five", "fie");

      String[] l=spellChecker.suggestSimilar("fi", 2);
      assertEquals(0, l.length);

      // test restricted to a field
      l=spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
      assertEquals(0, l.length); // there isn't the term thousand in the field field1

      l=spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
      assertEquals(1, l.length); // there is the term thousand in the field field2
    }
    finally {
      r.close();
    }
  }

  /**
   * Checks that the best suggestion for the misspelled word is the expected
   * one, failing with a message (not an array index error) when there is none.
   */
  private void assertFirstSuggestion (String expected, String misspelled) throws IOException {
    String[] l=spellChecker.suggestSimilar(misspelled, 2);
    assertTrue("no suggestion for "+misspelled, l.length>0);
    assertEquals("best suggestion for "+misspelled, expected, l[0]);
  }

  private void addwords (IndexReader r, String field) throws IOException {
    long time=System.currentTimeMillis();
    spellChecker.indexDictionnary(new LuceneDictionary(r, field));
    time=System.currentTimeMillis()-time;
    System.out.println("time to build "+field+": "+time);
  }

  /**
   * @return the number of documents (one per word) in the spell index
   */
  private int numdoc () throws IOException {
    IndexReader rs=IndexReader.open(spellindex);
    try {
      int num=rs.numDocs();
      assertTrue(num!=0);
      System.out.println("num docs: "+num);
      return num;
    }
    finally {
      rs.close();
    }
  }

  protected void tearDown () throws Exception {
    spellChecker=null;
    super.tearDown();
  }
}