mirror of https://github.com/apache/lucene.git
prelim checking of spellchecker, v1.1
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@151014 13f79535-47bb-0310-9956-ffa450edef68
parent a8d98638a0
commit 57cd076565

@@ -0,0 +1,156 @@
<?xml version="1.0" encoding="UTF-8"?>

<project basedir="." default="rebuild" name="Spelling checker">

  <property name="lucene.lib" value="d:/dev/lib/lucene.jar"/>
  <property name="lucenetest.lib" value="D:/dev/jakarta-lucene/build/classes/test"/>

  <property name="name" value="spellchecker"/>
  <property name="Name" value="spellchecker"/>
  <property name="version" value="1.1"/>
  <property name="year" value="2004"/>
  <property name="final.name" value="${name}-${version}"/>

  <property name="java" location="src/java"/>
  <property name="test" location="src/test"/>
  <property name="build.dir" location="build"/>
  <property name="build.java" location="${build.dir}/classes/java"/>
  <property name="build.test" location="${build.dir}/classes/test"/>
  <property name="build.javadocs" location="doc"/>

  <property name="javadoc.link" value="http://java.sun.com/j2se/1.4/docs/api/"/>
  <property name="javac.debug" value="off"/>

  <property name="junit.output.dir" location="${build.dir}/test"/>
  <property name="junit.reports" location="${build.dir}/test/reports"/>

  <!-- Build classpath -->
  <path id="classpath">
    <pathelement location="${lucene.lib}"/>
    <pathelement location="${build.java}"/>
  </path>

  <path id="test.classpath">
    <path refid="classpath"/>
    <pathelement location="${lucenetest.lib}"/>
    <pathelement location="${build.dir}/classes/test"/>
  </path>

  <!-- Patternset to exclude files from the output directory: -->

  <!-- ================================================================== -->
  <!-- C O M P I L E                                                      -->
  <!-- ================================================================== -->
  <target name="javacompile" description="Compiles core classes">
    <mkdir dir="${build.java}"/>
    <javac
        srcdir="${java}"
        includes="**/*.java"
        destdir="${build.java}"
        debug="${javac.debug}"
        optimize="on">
      <classpath refid="classpath"/>
    </javac>
  </target>

  <!-- ================================================================== -->
  <!-- J A R                                                              -->
  <!-- ================================================================== -->
  <target name="jar" depends="javacompile" description="Generates the Jar file">
    <jar
        destfile="${build.dir}/${final.name}.jar"
        basedir="${build.java}"/>
  </target>

  <!-- ================================================================== -->
  <!-- J A V A D O C                                                      -->
  <!-- ================================================================== -->
  <target name="javadoc">
    <mkdir dir="${build.javadocs}"/>
    <javadoc
        sourcepath="${java}"
        overview="src/java/overview.html"
        packagenames="org.apache.lucene.*"
        destdir="${build.javadocs}"
        author="true"
        version="true"
        use="true"
        link="${javadoc.link}"
        windowtitle="${Name} ${version} API"
        doctitle="${Name} ${version} API"
        bottom="Author: Nicolas Maisonneuve (${year})">
    </javadoc>
  </target>

  <!-- ================================================================== -->
  <!-- C L E A N                                                          -->
  <!-- ================================================================== -->
  <target name="clean">
    <delete failonerror="false" includeemptydirs="true">
      <fileset dir="${build.dir}"/>
    </delete>
  </target>

  <!-- ================================================================== -->
  <!-- B U I L D   T E S T                                                -->
  <!-- ================================================================== -->
  <target name="compile-test" depends="javacompile">
    <mkdir dir="${build.test}"/>
    <javac
        srcdir="${test}"
        includes="**/*.java"
        destdir="${build.test}"
        debug="true">
      <classpath refid="test.classpath"/>
    </javac>
  </target>

  <!-- ================================================================== -->
  <!-- R U N   T E S T S                                                  -->
  <!-- ================================================================== -->
  <target name="test" depends="compile-test" description="Runs unit tests">
    <!-- set junit.present if JUnit is visible to Ant -->
    <available property="junit.present" classname="junit.framework.TestCase"/>
    <fail unless="junit.present">
      ##################################################################
      JUnit not found.
      Please make sure junit.jar is in ANT_HOME/lib, or made available
      to Ant using other mechanisms like -lib or CLASSPATH.
      ##################################################################
    </fail>
    <mkdir dir="${junit.output.dir}"/>
    <junit printsummary="off" haltonfailure="no"
           errorProperty="tests.failed" failureProperty="tests.failed">
      <classpath refid="test.classpath"/>
      <sysproperty key="dataDir" file="src/test"/>
      <sysproperty key="tempDir" file="${build.dir}/test"/>
      <formatter type="xml"/>
      <formatter type="brief" usefile="false"/>
      <batchtest fork="yes" todir="${junit.output.dir}" unless="testcase">
        <fileset dir="src/test" includes="**/Test*.java"/>
      </batchtest>
      <batchtest fork="yes" todir="${junit.output.dir}" if="testcase">
        <fileset dir="src/test" includes="**/${testcase}.java"/>
      </batchtest>
    </junit>

    <fail if="tests.failed">Tests failed!</fail>
  </target>

  <target depends="javacompile" name="make"/>

  <target depends="clean,make" name="rebuild"/>

</project>

@@ -0,0 +1,33 @@
package org.apache.lucene.search.spell;

/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Iterator;

/**
 * A simple interface representing a Dictionary.
 * @author Nicolas Maisonneuve
 * @version 1.0
 */
public interface Dictionary {

    /**
     * Return all the words present in the dictionary.
     * @return Iterator over the words (as Strings)
     */
    public Iterator getWordsIterator();
}

@@ -0,0 +1,94 @@
package org.apache.lucene.search.spell;

/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;

import java.io.IOException;
import java.util.Iterator;

/**
 * Lucene Dictionary: a Dictionary backed by the terms of a field in a Lucene index.
 * @author Nicolas Maisonneuve
 */
public class LuceneDictionary implements Dictionary {
    IndexReader reader;
    String field;

    public LuceneDictionary (IndexReader reader, String field) {
        this.reader=reader;
        this.field=field;
    }

    public final Iterator getWordsIterator () {
        return new LuceneIterator();
    }

    final class LuceneIterator implements Iterator {
        private TermEnum termEnum;
        private Term actualTerm;
        private boolean has_next_called;

        public LuceneIterator () {
            try {
                termEnum=reader.terms(new Term(field, ""));
            }
            catch (IOException ex) {
                ex.printStackTrace();
            }
        }

        public Object next () {
            if (!has_next_called) {
                hasNext();
            }
            has_next_called=false;
            return (actualTerm!=null) ? actualTerm.text() : null;
        }

        public boolean hasNext () {
            has_next_called=true;
            try {
                // advance to the next term; stop if the enumeration is exhausted
                if (!termEnum.next()) {
                    actualTerm=null;
                    return false;
                }
                // stop as soon as the enumeration leaves the requested field
                actualTerm=termEnum.term();
                String fieldt=actualTerm.field();
                if (!fieldt.equals(field)) {
                    actualTerm=null;
                    return false;
                }
                return true;
            }
            catch (IOException ex) {
                ex.printStackTrace();
                return false;
            }
        }

        public void remove () {}
    }
}

@@ -0,0 +1,86 @@
package org.apache.lucene.search.spell;

/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Iterator;

/**
 * Dictionary represented by a plain text file.
 * Format: one word per line:
 * word1
 * word2
 * word3
 *
 * @author Nicolas Maisonneuve
 */
public class PlainTextDictionary implements Dictionary {

    private BufferedReader in;
    private String line;
    private boolean has_next_called;

    public PlainTextDictionary (File file) throws FileNotFoundException {
        in=new BufferedReader(new FileReader(file));
    }

    public PlainTextDictionary (InputStream dictFile) {
        in=new BufferedReader(new InputStreamReader(dictFile));
    }

    public Iterator getWordsIterator () {
        return new FileIterator();
    }

    final class FileIterator implements Iterator {
        public Object next () {
            if (!has_next_called) {
                hasNext();
            }
            has_next_called=false;
            return line;
        }

        public boolean hasNext () {
            has_next_called=true;
            try {
                line=in.readLine();
            }
            catch (IOException ex) {
                ex.printStackTrace();
                line=null;
                return false;
            }
            return line!=null;
        }

        public void remove () {}
    }
}

@@ -0,0 +1,363 @@
package org.apache.lucene.search.spell;

/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import java.util.*;

/**
 * <p>
 * Spell Checker class (Main class) <br/>
 * (initially inspired by the David Spencer code)
 * </p>
 *
 * <p>Example of use:</p>
 * <pre>
 * SpellChecker spellchecker = new SpellChecker(spellDirectory);
 *
 * // To index a field of a user index
 * spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));
 *
 * // To index a file containing words
 * spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));
 * </pre>
 *
 * @author Nicolas Maisonneuve
 * @version 1.0
 */
public class SpellChecker {

    /**
     * Field name for each word in the ngram index.
     */
    public static final String F_WORD="word";

    /**
     * the spell index
     */
    Directory spellindex;

    /**
     * Boost values for start and end grams
     */
    private float bStart=2.0f;
    private float bEnd=1.0f;

    private IndexReader reader;
    float min=0.5f;

    public void setSpellIndex (Directory spellindex) {
        this.spellindex=spellindex;
    }

    /**
     * Set the accuracy: the minimum score a suggestion must reach, between 0 and 1 (default 0.5).
     * @param min float
     */
    public void setAccuracy (float min) {
        this.min=min;
    }

    public SpellChecker (Directory gramIndex) {
        this.setSpellIndex(gramIndex);
    }

    /**
     * Suggest similar words.
     * @param word String the word you want a spell check done on
     * @param num_sug int the number of suggested words
     * @throws IOException
     * @return String[] the suggested words
     */
    public String[] suggestSimilar (String word, int num_sug) throws IOException {
        return this.suggestSimilar(word, num_sug, null, null, false);
    }

    /**
     * Suggest similar words (optionally restricted to a field of a user index).
     * @param word String the word you want a spell check done on
     * @param num_sug int the number of suggested words
     * @param ir IndexReader the indexReader of the user index (can be null, see the field parameter)
     * @param field String the field of the user index: if field is not null, the suggested
     * words are restricted to the words present in this field
     * @param morePopular boolean return only suggested words that are more frequent than the searched word
     * (only in restricted mode, i.e. indexReader!=null and field!=null)
     * @throws IOException
     * @return String[] the list of suggested words, sorted by two criteria:
     * first the edit distance, then (only in restricted mode) the popularity
     * of the suggested word in the field of the user index
     */
    public String[] suggestSimilar (String word, int num_sug, IndexReader ir, String field,
                                    boolean morePopular) throws IOException {
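        // Build one OR query over the ngram fields of the spell index (boosting the
        // first and last grams), then rank the matching words by normalized edit
        // distance and, in restricted mode, by their frequency in the user index field.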

        final TRStringDistance sd=new TRStringDistance(word);
        final int lengthWord=word.length();

        final int goalFreq=(morePopular&&ir!=null)?ir.docFreq(new Term(field, word)):0;
        if (!morePopular&&goalFreq>0) {
            return new String[] {word}; // return the word itself if it exists in the index and a more popular word is not wanted
        }

        BooleanQuery query=new BooleanQuery();
        String[] grams;
        String key;

        for (int ng=getMin(lengthWord); ng<=getMax(lengthWord); ng++) {

            key="gram"+ng; // form key

            grams=formGrams(word, ng); // form word into ngrams (allow dups too)

            if (grams.length==0) {
                continue; // hmm
            }

            if (bStart>0) { // should we boost prefixes?
                add(query, "start"+ng, grams[0], bStart); // matches start of word
            }
            if (bEnd>0) { // should we boost suffixes?
                add(query, "end"+ng, grams[grams.length-1], bEnd); // matches end of word
            }
            for (int i=0; i<grams.length; i++) {
                add(query, key, grams[i]);
            }
        }

        IndexSearcher searcher=new IndexSearcher(this.spellindex);
        Hits hits=searcher.search(query);
        SuggestWordQueue sugqueue=new SuggestWordQueue(num_sug);

        int stop=Math.min(hits.length(), 10*num_sug); // go through more matches than requested in case the distance filter rejects some
        SuggestWord sugword=new SuggestWord();
        for (int i=0; i<stop; i++) {

            sugword.string=hits.doc(i).get(F_WORD); // get orig word

            if (sugword.string.equals(word)) {
                continue; // don't suggest a word for itself, that would be silly
            }

            // edit distance, normalized by the length of the shorter word
            sugword.score=1.0f-((float) sd.getDistance(sugword.string)/Math.min(sugword.string.length(), lengthWord));
            if (sugword.score<min) {
                continue;
            }

            if (ir!=null) { // use the user index
                sugword.freq=ir.docFreq(new Term(field, sugword.string)); // freq in the index
                if ((morePopular&&goalFreq>sugword.freq)||sugword.freq<1) { // don't suggest a word that is not present in the field
                    continue;
                }
            }
            sugqueue.insert(sugword);
            if (sugqueue.size()==num_sug) {
                // if the queue is full, raise the bar to the lowest score currently in the queue
                min=((SuggestWord) sugqueue.top()).score;
            }
            sugword=new SuggestWord();
        }

        // convert the queue to an array of strings
        String[] list=new String[sugqueue.size()];
        for (int i=sugqueue.size()-1; i>=0; i--) {
            list[i]=((SuggestWord) sugqueue.pop()).string;
        }

        searcher.close();
        return list;
    }

    /**
     * Add a clause to a boolean query, with a boost.
     */
    private static void add (BooleanQuery q, String k, String v, float boost) {
        Query tq=new TermQuery(new Term(k, v));
        tq.setBoost(boost);
        q.add(new BooleanClause(tq, false, false));
    }

    /**
     * Add a clause to a boolean query.
     */
    private static void add (BooleanQuery q, String k, String v) {
        q.add(new BooleanClause(new TermQuery(new Term(k, v)), false, false));
    }

    /**
     * Form all ngrams for a given word.
     * @param text the word to parse
     * @param ng the ngram length, e.g. 3
     * @return an array of all ngrams in the word (duplicates are not removed)
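     * For example, formGrams("lucene", 3) yields {"luc", "uce", "cen", "ene"}.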
     */
    private static String[] formGrams (String text, int ng) {
        int len=text.length();
        String[] res=new String[len-ng+1];
        for (int i=0; i<len-ng+1; i++) {
            res[i]=text.substring(i, i+ng);
        }
        return res;
    }

    public void clearIndex () throws IOException {
        IndexReader.unlock(spellindex);
        IndexWriter writer=new IndexWriter(spellindex, null, true);
        writer.close();
    }

    /**
     * Check whether the word exists in the spell index.
     * @param word String
     * @throws IOException
     * @return boolean true if the word exists in the index
     */
    public boolean exist (String word) throws IOException {
        if (reader==null) {
            reader=IndexReader.open(spellindex);
        }
        return reader.docFreq(new Term(F_WORD, word))>0;
    }

    /**
     * Index a Dictionary.
     * @param dict the dictionary to index
     * @throws IOException
     */
    public void indexDictionary (Dictionary dict) throws IOException {
        IndexReader.unlock(spellindex);
        IndexWriter writer=new IndexWriter(spellindex, new WhitespaceAnalyzer(), !IndexReader.indexExists(spellindex));
        writer.mergeFactor=300;
        writer.minMergeDocs=150;

        Iterator iter=dict.getWordsIterator();
        while (iter.hasNext()) {
            String word=(String) iter.next();

            int len=word.length();
            if (len<3) {
                continue; // too short we bail but "too long" is fine...
            }

            if (this.exist(word)) { // if the word already exists in the gram index
                continue;
            }

            // ok index the word
            Document doc=createDocument(word, getMin(len), getMax(len));
            writer.addDocument(doc);
        }
        // close writer
        writer.optimize();
        writer.close();

        // close reader (it may still be null if no word was long enough to be indexed)
        if (reader!=null) {
            reader.close();
            reader=null;
        }
    }
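
    // Ngram lengths used for a word of length l: 3- and 4-grams for words longer
    // than 5 characters, 2- and 3-grams for 5-character words, 1- and 2-grams otherwise.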
    private int getMin (int l) {
        if (l>5) {
            return 3;
        }
        if (l==5) {
            return 2;
        }
        return 1;
    }

    private int getMax (int l) {
        if (l>5) {
            return 4;
        }
        if (l==5) {
            return 3;
        }
        return 2;
    }

    private static Document createDocument (String text, int ng1, int ng2) {
        Document doc=new Document();
        doc.add(Field.Keyword(F_WORD, text)); // orig term
        addGram(text, doc, ng1, ng2);
        return doc;
    }

    private static void addGram (String text, Document doc, int ng1, int ng2) {
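        // For each ngram length between ng1 and ng2, index every gram of the word under
        // "gram<n>", the first gram additionally under "start<n>" and the last under "end<n>".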
        int len=text.length();
        for (int ng=ng1; ng<=ng2; ng++) {
            String key="gram"+ng;
            String end=null;
            for (int i=0; i<len-ng+1; i++) {
                String gram=text.substring(i, i+ng);
                doc.add(Field.Keyword(key, gram));
                if (i==0) {
                    doc.add(Field.Keyword("start"+ng, gram));
                }
                end=gram;
            }
            if (end!=null) { // may not be present if len==ng1
                doc.add(Field.Keyword("end"+ng, end));
            }
        }
    }

    protected void finalize () throws Throwable {
        if (reader!=null) {
            reader.close();
        }
    }
}

@@ -0,0 +1,64 @@
package org.apache.lucene.search.spell;

/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * SuggestWord class:
 * used by the suggestSimilar method in the SpellChecker class.
 * @author Nicolas Maisonneuve
 */
final class SuggestWord {
    /**
     * the score of the word
     */
    public float score;

    /**
     * the frequency of the word
     */
    public int freq;

    /**
     * the suggested word
     */
    public String string;

    public final int compareTo (SuggestWord a) {
        // first criterion: the edit distance
        if (score>a.score) {
            return 1;
        }
        if (score<a.score) {
            return -1;
        }

        // second criterion (if the first is equal): the popularity
        if (freq>a.freq) {
            return 1;
        }
        if (freq<a.freq) {
            return -1;
        }
        return 0;
    }
}

@@ -0,0 +1,41 @@
package org.apache.lucene.search.spell;

/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.PriorityQueue;

/**
 * Priority queue used to sort SuggestWord suggestions.
 * @author Nicolas Maisonneuve
 */
final class SuggestWordQueue extends PriorityQueue {

    SuggestWordQueue (int size) {
        initialize(size);
    }
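
    // Orders by score, then frequency, so the least relevant suggestion sits at the
    // head of the queue and is the first one displaced when the queue is full.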
    protected final boolean lessThan (Object a, Object b) {
        SuggestWord wa=(SuggestWord) a;
        SuggestWord wb=(SuggestWord) b;
        int val=wa.compareTo(wb);
        return val<0;
    }
}

@@ -0,0 +1,132 @@
package org.apache.lucene.search.spell;

/**
 * Copyright 2002-2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Edit distance class.
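 * Computes the Levenshtein (edit) distance between a fixed target word and other
 * strings; for example, the distance between "kitten" and "sitting" is 3.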
 */
public final class TRStringDistance {

    final char[] sa;
    final int n;
    final int[][][] cache=new int[30][][];

    /**
     * Optimized to run a bit faster than the static getDistance().
     * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
     */
    public TRStringDistance (String target) {
        sa=target.toCharArray();
        n=sa.length;
    }

    //*****************************
    // Compute Levenshtein distance
    //*****************************
    public final int getDistance (String other) {
        int[][] d; // distance matrix
        int cost;

        // Step 1: handle empty strings
        final char[] ta=other.toCharArray();
        final int m=ta.length;
        if (n==0) {
            return m;
        }
        if (m==0) {
            return n;
        }

        // get a matrix of the right size, reusing a cached one where possible
        if (m>=cache.length) {
            d=form(n, m);
        }
        else if (cache[m]!=null) {
            d=cache[m];
        }
        else {
            d=cache[m]=form(n, m);
        }

        // Steps 3-6: fill the matrix cell by cell
        for (int i=1; i<=n; i++) {
            final char s_i=sa[i-1];
            for (int j=1; j<=m; j++) {
                final char t_j=ta[j-1];
                cost=(s_i==t_j) ? 0 : 1; // 0 if the characters match, 1 otherwise
                d[i][j]=min3(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+cost);
            }
        }

        // Step 7
        return d[n][m];
    }

    /**
     * Create and initialize a distance matrix: the first row and column hold the
     * distances from the empty string.
     */
    private static int[][] form (int n, int m) {
        int[][] d=new int[n+1][m+1];
        // Step 2
        for (int i=0; i<=n; i++) {
            d[i][0]=i;
        }
        for (int j=0; j<=m; j++) {
            d[0][j]=j;
        }
        return d;
    }

    //****************************
    // Get minimum of three values
    //****************************
    private static int min3 (int a, int b, int c) {
        int mi=a;
        if (b<mi) {
            mi=b;
        }
        if (c<mi) {
            mi=c;
        }
        return mi;
    }
}

@@ -0,0 +1,122 @@
package org.apache.lucene.search.spell;

import junit.framework.*;
import org.apache.lucene.search.spell.*;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.util.English;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import java.io.File;

/**
 * Test case for the SpellChecker.
 * @author Nicolas Maisonneuve
 */
public class TestSpellChecker extends TestCase {
    private SpellChecker spellChecker;
    Directory userindex, spellindex;

    protected void setUp () throws Exception {
        super.setUp();

        // create a user index
        userindex=new RAMDirectory();
        IndexWriter writer=new IndexWriter(userindex, new SimpleAnalyzer(), true);

        for (int i=0; i<1000; i++) {
            Document doc=new Document();
            doc.add(Field.Text("field1", English.intToEnglish(i)));
            doc.add(Field.Text("field2", English.intToEnglish(i+1))); // + word thousand
            writer.addDocument(doc);
        }
        writer.close();

        // create the spellChecker
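        // NOTE: the spell index is written to a hard-coded local directory and is not
        // removed by tearDown(); adjust the path for your environment.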
        File file=new File("d://test");
        spellindex=FSDirectory.getDirectory(file, true);
        spellChecker=new SpellChecker(spellindex);
    }

    public void testBuild () {
        try {
            IndexReader r=IndexReader.open(userindex);

            spellChecker.clearIndex();

            addwords(r, "field1");
            int num_field1=this.numdoc();

            addwords(r, "field2");
            int num_field2=this.numdoc();

            this.assertTrue(num_field2==num_field1+1);

            // test a small word
            String[] l=spellChecker.suggestSimilar("fvie", 2);
            this.assertTrue(l[0].equals("five"));

            l=spellChecker.suggestSimilar("fiv", 2);
            this.assertTrue(l[0].equals("five"));

            l=spellChecker.suggestSimilar("ive", 2);
            this.assertTrue(l[0].equals("five"));

            l=spellChecker.suggestSimilar("fives", 2);
            this.assertTrue(l[0].equals("five"));

            l=spellChecker.suggestSimilar("fie", 2);
            this.assertTrue(l[0].equals("five"));

            l=spellChecker.suggestSimilar("fi", 2);
            this.assertEquals(0, l.length);

            // test suggestions restricted to a field
            l=spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
            this.assertEquals(0, l.length); // the term "thousand" does not appear in field1

            l=spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
            this.assertEquals(1, l.length); // the term "thousand" appears in field2
        }
        catch (IOException e) {
            e.printStackTrace();
            this.assertTrue(false);
        }
    }

    private void addwords (IndexReader r, String field) throws IOException {
        long time=System.currentTimeMillis();
        spellChecker.indexDictionary(new LuceneDictionary(r, field));
        time=System.currentTimeMillis()-time;
        System.out.println("time to build "+field+": "+time);
    }

    private int numdoc () throws IOException {
        IndexReader rs=IndexReader.open(spellindex);
        int num=rs.numDocs();
        this.assertTrue(num!=0);
        System.out.println("num docs: "+num);
        rs.close();
        return num;
    }

    protected void tearDown () throws Exception {
        spellChecker=null;
        super.tearDown();
    }
}