mirror of https://github.com/apache/lucene.git
prelim checking of spellchecker, v1.1
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@151014 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a8d98638a0
commit
57cd076565
|
@ -0,0 +1,156 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
|
||||||
|
<project basedir="." default="rebuild" name="Spelling checker">
|
||||||
|
|
||||||
|
<property name="lucene.lib" value="d:/dev/lib/lucene.jar"/>
|
||||||
|
<property name="lucenetest.lib" value="D:/dev/jakarta-lucene/build/classes/test"/>
|
||||||
|
|
||||||
|
|
||||||
|
<property name="name" value="spellchecker"/>
|
||||||
|
<property name="Name" value="spellchecker"/>
|
||||||
|
<property name="version" value="1.1"/>
|
||||||
|
<property name="year" value="2004"/>
|
||||||
|
<property name="final.name" value="${name}-${version}"/>
|
||||||
|
<property name="java" location="src/java"/>
|
||||||
|
<property name="test" location="src/test"/>
|
||||||
|
<property name="build.dir" location="build"/>
|
||||||
|
<property name="build.java" location="${build.dir}/classes/java"/>
|
||||||
|
<property name="build.test" location="${build.dir}/classes/test"/>
|
||||||
|
<property name="build.javadocs" location="doc"/>
|
||||||
|
<property name="javadoc.link" value="http://java.sun.com/j2se/1.4/docs/api/"/>
|
||||||
|
<property name="javac.debug" value="off"/>
|
||||||
|
<property name="junit.output.dir" location="${build.dir}/test"/>
|
||||||
|
<property name="junit.reports" location="${build.dir}/test/reports"/>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<!-- Build classpath -->
|
||||||
|
<path id="classpath">
|
||||||
|
<pathelement location="${lucene.lib}"/>
|
||||||
|
|
||||||
|
<pathelement location="${build.java}"/>
|
||||||
|
</path>
|
||||||
|
|
||||||
|
<path id="test.classpath">
|
||||||
|
<path refid="classpath"/>
|
||||||
|
<pathelement location="${lucenetest.lib}"/>
|
||||||
|
<pathelement location="${build.dir}/classes/test"/>
|
||||||
|
</path>
|
||||||
|
<!--Patternset to exclude files from the output directory:-->
|
||||||
|
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<!-- C O M P I L E -->
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<!-- -->
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<target name="javacompile"
|
||||||
|
description="Compiles core classes">
|
||||||
|
<mkdir dir="${build.java}"/>
|
||||||
|
<javac
|
||||||
|
srcdir="${java}"
|
||||||
|
includes="**/*.java"
|
||||||
|
destdir="${build.java}"
|
||||||
|
debug="${javac.debug}"
|
||||||
|
optimize="on">
|
||||||
|
<classpath refid="classpath"/>
|
||||||
|
</javac>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<!-- J A R -->
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<!-- -->
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<target name="jar" depends="javacompile" description="Generates the Jar file">
|
||||||
|
<jar
|
||||||
|
destfile="${build.dir}/${final.name}.jar"
|
||||||
|
basedir="${build.java}" />
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<!-- J A V A D O C -->
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<!-- -->
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<target name="javadoc">
|
||||||
|
<mkdir dir="${build.javadocs}"/>
|
||||||
|
<javadoc
|
||||||
|
sourcepath="${java}"
|
||||||
|
overview="src/java/overview.html"
|
||||||
|
packagenames="org.apache.lucene.*"
|
||||||
|
destdir="${build.javadocs}"
|
||||||
|
author="true"
|
||||||
|
version="true"
|
||||||
|
use="true"
|
||||||
|
link="${javadoc.link}"
|
||||||
|
windowtitle="${Name} ${version} API"
|
||||||
|
doctitle="${Name} ${version} API"
|
||||||
|
bottom="Author: Nicolas Maisonneuve (${year})" >
|
||||||
|
</javadoc>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<!-- C L E A N -->
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<!-- -->
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<target name="clean">
|
||||||
|
<delete failonerror="false" includeemptydirs="true">
|
||||||
|
<fileset dir="${build.dir}"/>
|
||||||
|
</delete>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<!-- B U I L D T E S T -->
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<!-- -->
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<target name="compile-test" depends="javacompile">
|
||||||
|
<mkdir dir="${build.test}"/>
|
||||||
|
<javac
|
||||||
|
srcdir="${test}"
|
||||||
|
includes="**/*.java"
|
||||||
|
destdir="${build.test}"
|
||||||
|
debug="true">
|
||||||
|
<classpath refid="test.classpath"/>
|
||||||
|
</javac>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<!-- R U N T E S T S -->
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<!-- -->
|
||||||
|
<!-- ================================================================== -->
|
||||||
|
<target name="test" depends="compile-test" description="Runs unit tests">
|
||||||
|
<fail unless="junit.present">
|
||||||
|
##################################################################
|
||||||
|
JUnit not found.
|
||||||
|
Please make sure junit.jar is in ANT_HOME/lib, or made available
|
||||||
|
to Ant using other mechanisms like -lib or CLASSPATH.
|
||||||
|
##################################################################
|
||||||
|
</fail>
|
||||||
|
<mkdir dir="${junit.output.dir}"/>
|
||||||
|
<junit printsummary="off" haltonfailure="no"
|
||||||
|
errorProperty="tests.failed" failureProperty="tests.failed">
|
||||||
|
<classpath refid="junit.classpath"/>
|
||||||
|
<sysproperty key="dataDir" file="src/test"/>
|
||||||
|
<sysproperty key="tempDir" file="${build.dir}/test"/>
|
||||||
|
<formatter type="xml"/>
|
||||||
|
<formatter type="brief" usefile="false"/>
|
||||||
|
<batchtest fork="yes" todir="${junit.output.dir}" unless="testcase">
|
||||||
|
<fileset dir="src/test" includes="**/Test*.java"/>
|
||||||
|
</batchtest>
|
||||||
|
<batchtest fork="yes" todir="${junit.output.dir}" if="testcase">
|
||||||
|
<fileset dir="src/test" includes="**/${testcase}.java"/>
|
||||||
|
</batchtest>
|
||||||
|
</junit>
|
||||||
|
|
||||||
|
<fail if="tests.failed">Tests failed!</fail>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target depends="javacompile" name="make"/>
|
||||||
|
|
||||||
|
<target depends="clean,make" name="rebuild"/>
|
||||||
|
|
||||||
|
</project>
|
|
@ -0,0 +1,33 @@
|
||||||
|
package org.apache.lucene.search.spell;
|
||||||
|
/**
|
||||||
|
* Copyright 2002-2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A simple interface representing a Dictionary
|
||||||
|
* @author Nicolas Maisonneuve
|
||||||
|
* @version 1.0
|
||||||
|
*/
|
||||||
|
public interface Dictionary {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* return all the words present in the dictionnary
|
||||||
|
* @return Iterator
|
||||||
|
*/
|
||||||
|
public Iterator getWordsIterator();
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,94 @@
|
||||||
|
package org.apache.lucene.search.spell;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2002-2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import org.apache.lucene.index.TermEnum;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lucene Dictionnary
|
||||||
|
* @author Nicolas Maisonneuve
|
||||||
|
*/
|
||||||
|
public class LuceneDictionary
|
||||||
|
implements Dictionary {
|
||||||
|
IndexReader reader;
|
||||||
|
String field;
|
||||||
|
|
||||||
|
public LuceneDictionary (IndexReader reader, String field) {
|
||||||
|
this.reader=reader;
|
||||||
|
this.field=field;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public final Iterator getWordsIterator () {
|
||||||
|
return new LuceneIterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
final class LuceneIterator implements Iterator {
|
||||||
|
private TermEnum enum;
|
||||||
|
private Term actualTerm;
|
||||||
|
private boolean has_next_called;
|
||||||
|
|
||||||
|
public LuceneIterator () {
|
||||||
|
try {
|
||||||
|
enum=reader.terms(new Term(field, ""));
|
||||||
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
ex.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Object next () {
|
||||||
|
if (!has_next_called) {hasNext();}
|
||||||
|
has_next_called=false;
|
||||||
|
return (actualTerm!=null) ? actualTerm.text(): null;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean hasNext () {
|
||||||
|
has_next_called=true;
|
||||||
|
try {
|
||||||
|
// if there is still words
|
||||||
|
if (!enum.next()) {
|
||||||
|
actualTerm=null;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// if the next word are in the field
|
||||||
|
actualTerm=enum.term();
|
||||||
|
String fieldt=actualTerm.field();
|
||||||
|
if (fieldt!=field) {
|
||||||
|
actualTerm=null;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
ex.printStackTrace();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void remove () {};
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,86 @@
|
||||||
|
package org.apache.lucene.search.spell;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2002-2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* dictionary represented by a file text
|
||||||
|
* Format allowed: 1 word per line:
|
||||||
|
* word1
|
||||||
|
* word2
|
||||||
|
* word3
|
||||||
|
*
|
||||||
|
* @author Nicolas Maisonneuve
|
||||||
|
*/
|
||||||
|
public class PlainTextDictionary implements Dictionary {
|
||||||
|
|
||||||
|
private BufferedReader in;
|
||||||
|
private String line;
|
||||||
|
private boolean has_next_called;
|
||||||
|
|
||||||
|
public PlainTextDictionary (File file) throws FileNotFoundException {
|
||||||
|
in=new BufferedReader(new FileReader(file));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public PlainTextDictionary (InputStream dictFile) {
|
||||||
|
in=new BufferedReader(new InputStreamReader(System.in));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Iterator getWordsIterator () {
|
||||||
|
|
||||||
|
return new fileIterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
final class fileIterator
|
||||||
|
implements Iterator {
|
||||||
|
public Object next () {
|
||||||
|
if (!has_next_called) {
|
||||||
|
hasNext();
|
||||||
|
}
|
||||||
|
has_next_called=false;
|
||||||
|
return line;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean hasNext () {
|
||||||
|
has_next_called=true;
|
||||||
|
try {
|
||||||
|
line=in.readLine();
|
||||||
|
}
|
||||||
|
catch (IOException ex) {
|
||||||
|
ex.printStackTrace();
|
||||||
|
line=null;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return (line!=null)?true:false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void remove () {};
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,363 @@
|
||||||
|
package org.apache.lucene.search.spell;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2002-2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.TermEnum;
|
||||||
|
import org.apache.lucene.search.BooleanClause;
|
||||||
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
|
import org.apache.lucene.search.Hits;
|
||||||
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.TermQuery;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* <p>
|
||||||
|
* Spell Checker class (Main class) <br/>
|
||||||
|
* (initially inspired by the David Spencer code)
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Spell Checker spellchecker= new SpellChecker (spellDirectory);<br/>
|
||||||
|
* <br/>
|
||||||
|
* //To index a field of a user index <br/>
|
||||||
|
* spellchecker.indexDictionary(new LuceneDictionary(my_lucene_reader, a_field));<br/>
|
||||||
|
*<br/>
|
||||||
|
* //To index a file containing words <br/>
|
||||||
|
* spellchecker.indexDictionary(new PlainTextDictionary(new File("myfile.txt")));<br/>
|
||||||
|
*</p>
|
||||||
|
*
|
||||||
|
* @author Nicolas Maisonneuve
|
||||||
|
* @version 1.0
|
||||||
|
*/
|
||||||
|
public class SpellChecker {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Field name for each word in the ngram index.
|
||||||
|
*/
|
||||||
|
public static final String F_WORD="word";
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the spell index
|
||||||
|
*/
|
||||||
|
Directory spellindex;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Boost value for start and end grams
|
||||||
|
*/private float bStart=2.0f;
|
||||||
|
private float bEnd=1.0f;
|
||||||
|
|
||||||
|
|
||||||
|
private IndexReader reader;
|
||||||
|
float min=0.5f;
|
||||||
|
|
||||||
|
public void setSpellIndex (Directory spellindex) {
|
||||||
|
this.spellindex=spellindex;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set the accuraty 0<min<1 default 0.5
|
||||||
|
* @param min float
|
||||||
|
*/
|
||||||
|
public void setAccuraty (float min) {
|
||||||
|
this.min=min;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public SpellChecker (Directory gramIndex) {
|
||||||
|
this.setSpellIndex(gramIndex);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Suggest similar words
|
||||||
|
* @param word String the word you want a spell check done on
|
||||||
|
* @param num_sug int the number of suggest words
|
||||||
|
* @throws IOException
|
||||||
|
* @return String[]
|
||||||
|
*/
|
||||||
|
public String[] suggestSimilar (String word, int num_sug) throws IOException {
|
||||||
|
return this.suggestSimilar(word, num_sug, null, null, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Suggest similar words (restricted or not of a field of a user index)
|
||||||
|
* @param word String the word you want a spell check done on
|
||||||
|
* @param num_sug int the number of suggest words
|
||||||
|
* @param IndexReader the indexReader of the user index (can be null see field param)
|
||||||
|
* @param field String the field of the user index: if field is not null ,the suggest
|
||||||
|
* words are restricted to the words present in this field.
|
||||||
|
* @param morePopular boolean return only the suggest words that are more frequent than the searched word
|
||||||
|
* (only if restricted mode = (indexReader!=null and field!=null)
|
||||||
|
* @throws IOException
|
||||||
|
* @return String[] the sorted list of the suggest words with this 2 criteri
|
||||||
|
* first criteria : the edit distance, second criteria (only if restricted mode): the popularity
|
||||||
|
* of the suggest words in the field of the user index
|
||||||
|
*/
|
||||||
|
public String[] suggestSimilar (String word, int num_sug, IndexReader ir, String field
|
||||||
|
, boolean morePopular) throws IOException {
|
||||||
|
|
||||||
|
final TRStringDistance sd=new TRStringDistance(word);
|
||||||
|
final int lengthWord=word.length();
|
||||||
|
|
||||||
|
final int goalFreq=(morePopular&&ir!=null)?ir.docFreq(new Term(field, word)):0;
|
||||||
|
if (!morePopular&&goalFreq>0) {
|
||||||
|
return new String[] {
|
||||||
|
word}; // return the word if it exist in the index and i don't want a more popular word
|
||||||
|
}
|
||||||
|
|
||||||
|
BooleanQuery query=new BooleanQuery();
|
||||||
|
String[] grams;
|
||||||
|
String key;
|
||||||
|
|
||||||
|
for (int ng=getMin(lengthWord); ng<=getMax(lengthWord); ng++) {
|
||||||
|
|
||||||
|
key="gram"+ng; // form key
|
||||||
|
|
||||||
|
grams=formGrams(word, ng); // form word into ngrams (allow dups too)
|
||||||
|
|
||||||
|
if (grams.length==0) {
|
||||||
|
continue; // hmm
|
||||||
|
}
|
||||||
|
|
||||||
|
if (bStart>0) { // should we boost prefixes?
|
||||||
|
add(query, "start"+ng, grams[0], bStart); // matches start of word
|
||||||
|
|
||||||
|
}
|
||||||
|
if (bEnd>0) { // should we boost suffixes
|
||||||
|
add(query, "end"+ng, grams[grams.length-1], bEnd); // matches end of word
|
||||||
|
|
||||||
|
}
|
||||||
|
for (int i=0; i<grams.length; i++) {
|
||||||
|
add(query, key, grams[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
IndexSearcher searcher=new IndexSearcher(this.spellindex);
|
||||||
|
Hits hits=searcher.search(query);
|
||||||
|
SuggestWordQueue sugqueue=new SuggestWordQueue(num_sug);
|
||||||
|
|
||||||
|
int stop=Math.min(hits.length(), 10*num_sug); // go thru more than 'maxr' matches in case the distance filter triggers
|
||||||
|
SuggestWord sugword=new SuggestWord();
|
||||||
|
for (int i=0; i<stop; i++) {
|
||||||
|
|
||||||
|
sugword.string=hits.doc(i).get(F_WORD); // get orig word)
|
||||||
|
|
||||||
|
if (sugword.string==word) {
|
||||||
|
continue; // don't suggest a word for itself, that would be silly
|
||||||
|
}
|
||||||
|
|
||||||
|
//edit distance/normalize with the min word length
|
||||||
|
sugword.score=1.0f-((float) sd.getDistance(sugword.string)/Math.min(sugword.string.length(), lengthWord));
|
||||||
|
if (sugword.score<min) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ir!=null) { // use the user index
|
||||||
|
sugword.freq=ir.docFreq(new Term(field, sugword.string)); // freq in the index
|
||||||
|
if ((morePopular&&goalFreq>sugword.freq)||sugword.freq<1) { // don't suggest a word that is not present in the field
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
sugqueue.insert(sugword);
|
||||||
|
if (sugqueue.size()==num_sug) {
|
||||||
|
//if queue full , maintain the min score
|
||||||
|
min=((SuggestWord) sugqueue.top()).score;
|
||||||
|
}
|
||||||
|
sugword=new SuggestWord();
|
||||||
|
}
|
||||||
|
|
||||||
|
// convert to array string
|
||||||
|
String[] list=new String[sugqueue.size()];
|
||||||
|
for (int i=sugqueue.size()-1; i>=0; i--) {
|
||||||
|
list[i]=((SuggestWord) sugqueue.pop()).string;
|
||||||
|
}
|
||||||
|
|
||||||
|
searcher.close();
|
||||||
|
return list;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add a clause to a boolean query.
|
||||||
|
*/
|
||||||
|
private static void add (BooleanQuery q, String k, String v, float boost) {
|
||||||
|
Query tq=new TermQuery(new Term(k, v));
|
||||||
|
tq.setBoost(boost);
|
||||||
|
q.add(new BooleanClause(tq, false, false));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Add a clause to a boolean query.
|
||||||
|
*/
|
||||||
|
private static void add (BooleanQuery q, String k, String v) {
|
||||||
|
q.add(new BooleanClause(new TermQuery(new Term(k, v)), false, false));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Form all ngrams for a given word.
|
||||||
|
* @param text the word to parse
|
||||||
|
* @param ng the ngram length e.g. 3
|
||||||
|
* @return an array of all ngrams in the word and note that duplicates are not removed
|
||||||
|
*/
|
||||||
|
private static String[] formGrams (String text, int ng) {
|
||||||
|
int len=text.length();
|
||||||
|
String[] res=new String[len-ng+1];
|
||||||
|
for (int i=0; i<len-ng+1; i++) {
|
||||||
|
res[i]=text.substring(i, i+ng);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void clearIndex () throws IOException {
|
||||||
|
IndexReader.unlock(spellindex);
|
||||||
|
IndexWriter writer=new IndexWriter(spellindex, null, true);
|
||||||
|
writer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* if the word exist in the index
|
||||||
|
* @param word String
|
||||||
|
* @throws IOException
|
||||||
|
* @return boolean
|
||||||
|
*/
|
||||||
|
public boolean exist (String word) throws IOException {
|
||||||
|
if (reader==null) {
|
||||||
|
reader=IndexReader.open(spellindex);
|
||||||
|
}
|
||||||
|
return reader.docFreq(new Term(F_WORD, word))>0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Index a Dictionnary
|
||||||
|
* @param dict the dictionnary to index
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public void indexDictionnary (Dictionary dict) throws IOException {
|
||||||
|
|
||||||
|
int ng1, ng2;
|
||||||
|
IndexReader.unlock(spellindex);
|
||||||
|
IndexWriter writer=new IndexWriter(spellindex, new WhitespaceAnalyzer(), !IndexReader.indexExists(spellindex));
|
||||||
|
writer.mergeFactor=300;
|
||||||
|
writer.minMergeDocs=150;
|
||||||
|
|
||||||
|
Iterator iter=dict.getWordsIterator();
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
String word=(String) iter.next();
|
||||||
|
|
||||||
|
int len=word.length();
|
||||||
|
if (len<3) {
|
||||||
|
continue; // too short we bail but "too long" is fine...
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.exist(word)) { // if the word already exist in the gramindex
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ok index the word
|
||||||
|
Document doc=createDocument(word, getMin(len), getMax(len));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
// close writer
|
||||||
|
writer.optimize();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
// close reader
|
||||||
|
reader.close();
|
||||||
|
reader=null;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private int getMin (int l) {
|
||||||
|
if (l>5) {
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
if (l==5) {
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private int getMax (int l) {
|
||||||
|
if (l>5) {
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
if (l==5) {
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
return 2;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static Document createDocument (String text, int ng1, int ng2) {
|
||||||
|
Document doc=new Document();
|
||||||
|
doc.add(Field.Keyword(F_WORD, text)); // orig term
|
||||||
|
addGram(text, doc, ng1, ng2);
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void addGram (String text, Document doc, int ng1, int ng2) {
|
||||||
|
int len=text.length();
|
||||||
|
for (int ng=ng1; ng<=ng2; ng++) {
|
||||||
|
String key="gram"+ng;
|
||||||
|
String end=null;
|
||||||
|
for (int i=0; i<len-ng+1; i++) {
|
||||||
|
String gram=text.substring(i, i+ng);
|
||||||
|
doc.add(Field.Keyword(key, gram));
|
||||||
|
if (i==0) {
|
||||||
|
doc.add(Field.Keyword("start"+ng, gram));
|
||||||
|
}
|
||||||
|
end=gram;
|
||||||
|
}
|
||||||
|
if (end!=null) { // may not be present if len==ng1
|
||||||
|
doc.add(Field.Keyword("end"+ng, end));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
protected void finalize () throws Throwable {
|
||||||
|
if (reader!=null) {
|
||||||
|
reader.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,64 @@
|
||||||
|
package org.apache.lucene.search.spell;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2002-2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* SuggestWord Class
|
||||||
|
* used in suggestSimilat method in SpellChecker class
|
||||||
|
* @author Nicolas Maisonneuve
|
||||||
|
*/
|
||||||
|
final class SuggestWord {
|
||||||
|
/**
|
||||||
|
* the score of the word
|
||||||
|
*/
|
||||||
|
public float score;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The freq of the word
|
||||||
|
*/
|
||||||
|
public int freq;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* the suggested word
|
||||||
|
*/
|
||||||
|
public String string;
|
||||||
|
|
||||||
|
|
||||||
|
public final int compareTo (SuggestWord a) {
|
||||||
|
//first criteria: the edit distance
|
||||||
|
if (score>a.score) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (score<a.score) {
|
||||||
|
return-1;
|
||||||
|
}
|
||||||
|
|
||||||
|
//second criteria (if first criteria is equal): the popularity
|
||||||
|
if (freq>a.freq) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (freq<a.freq) {
|
||||||
|
return-1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,41 @@
|
||||||
|
package org.apache.lucene.search.spell;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2002-2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* to sort SuggestWord
|
||||||
|
* @author Nicolas Maisonneuve
|
||||||
|
*/
|
||||||
|
import org.apache.lucene.util.PriorityQueue;
|
||||||
|
|
||||||
|
|
||||||
|
final class SuggestWordQueue
|
||||||
|
extends PriorityQueue {
|
||||||
|
|
||||||
|
SuggestWordQueue (int size) {
|
||||||
|
initialize(size);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected final boolean lessThan (Object a, Object b) {
|
||||||
|
SuggestWord wa=(SuggestWord) a;
|
||||||
|
SuggestWord wb=(SuggestWord) b;
|
||||||
|
int val=wa.compareTo(wb);
|
||||||
|
return val<0;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,132 @@
|
||||||
|
package org.apache.lucene.search.spell;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2002-2004 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Edit distance class
|
||||||
|
*/
|
||||||
|
public final class TRStringDistance {
|
||||||
|
|
||||||
|
final char[] sa;
|
||||||
|
final int n;
|
||||||
|
final int[][][] cache=new int[30][][];
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Optimized to run a bit faster than the static getDistance().
|
||||||
|
* In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
|
||||||
|
*/
|
||||||
|
public TRStringDistance (String target) {
|
||||||
|
sa=target.toCharArray();
|
||||||
|
n=sa.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//*****************************
|
||||||
|
// Compute Levenshtein distance
|
||||||
|
//*****************************
|
||||||
|
public final int getDistance (String other) {
|
||||||
|
int d[][]; // matrix
|
||||||
|
int cost; // cost
|
||||||
|
|
||||||
|
// Step 1
|
||||||
|
final char[] ta=other.toCharArray();
|
||||||
|
final int m=ta.length;
|
||||||
|
if (n==0) {
|
||||||
|
return m;
|
||||||
|
}
|
||||||
|
if (m==0) {
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (m>=cache.length) {
|
||||||
|
d=form(n, m);
|
||||||
|
}
|
||||||
|
else if (cache[m]!=null) {
|
||||||
|
d=cache[m];
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
d=cache[m]=form(n, m);
|
||||||
|
|
||||||
|
// Step 3
|
||||||
|
|
||||||
|
}
|
||||||
|
for (int i=1; i<=n; i++) {
|
||||||
|
final char s_i=sa[i-1];
|
||||||
|
|
||||||
|
// Step 4
|
||||||
|
|
||||||
|
for (int j=1; j<=m; j++) {
|
||||||
|
final char t_j=ta[j-1];
|
||||||
|
|
||||||
|
// Step 5
|
||||||
|
|
||||||
|
if (s_i==t_j) { // same
|
||||||
|
cost=0;
|
||||||
|
}
|
||||||
|
else { // not a match
|
||||||
|
cost=1;
|
||||||
|
|
||||||
|
// Step 6
|
||||||
|
|
||||||
|
}
|
||||||
|
d[i][j]=min3(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+cost);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 7
|
||||||
|
return d[n][m];
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
private static int[][] form (int n, int m) {
|
||||||
|
int[][] d=new int[n+1][m+1];
|
||||||
|
// Step 2
|
||||||
|
|
||||||
|
for (int i=0; i<=n; i++) {
|
||||||
|
d[i][0]=i;
|
||||||
|
|
||||||
|
}
|
||||||
|
for (int j=0; j<=m; j++) {
|
||||||
|
d[0][j]=j;
|
||||||
|
}
|
||||||
|
return d;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//****************************
|
||||||
|
// Get minimum of three values
|
||||||
|
//****************************
|
||||||
|
private static int min3 (int a, int b, int c) {
|
||||||
|
int mi=a;
|
||||||
|
if (b<mi) {
|
||||||
|
mi=b;
|
||||||
|
}
|
||||||
|
if (c<mi) {
|
||||||
|
mi=c;
|
||||||
|
}
|
||||||
|
return mi;
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,122 @@
|
||||||
|
package org.apache.lucene.search.spell;
|
||||||
|
|
||||||
|
|
||||||
|
import junit.framework.*;
|
||||||
|
import org.apache.lucene.search.spell.*;
|
||||||
|
import org.apache.lucene.store.RAMDirectory;
|
||||||
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.util.English;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import org.apache.lucene.store.FSDirectory;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test case
|
||||||
|
* @author Nicolas Maisonneuve
|
||||||
|
*/
|
||||||
|
|
||||||
|
public class TestSpellChecker
|
||||||
|
extends TestCase {
|
||||||
|
private SpellChecker spellChecker;
|
||||||
|
Directory userindex, spellindex;
|
||||||
|
|
||||||
|
protected void setUp () throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
|
||||||
|
//create a user index
|
||||||
|
userindex=new RAMDirectory();
|
||||||
|
IndexWriter writer=new IndexWriter(userindex, new SimpleAnalyzer(), true);
|
||||||
|
|
||||||
|
for (int i=0; i<1000; i++) {
|
||||||
|
Document doc=new Document();
|
||||||
|
doc.add(Field.Text("field1", English.intToEnglish(i)));
|
||||||
|
doc.add(Field.Text("field2", English.intToEnglish(i+1))); // + word thousand
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
// create the spellChecker
|
||||||
|
File file=new File("d://test");
|
||||||
|
spellindex=FSDirectory.getDirectory(file, true);
|
||||||
|
spellChecker=new SpellChecker(spellindex);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void testBuild () {
|
||||||
|
try {
|
||||||
|
IndexReader r=IndexReader.open(userindex);
|
||||||
|
|
||||||
|
spellChecker.clearIndex();
|
||||||
|
|
||||||
|
addwords(r, "field1");
|
||||||
|
int num_field1=this.numdoc();
|
||||||
|
|
||||||
|
addwords(r, "field2");
|
||||||
|
int num_field2=this.numdoc();
|
||||||
|
|
||||||
|
this.assertTrue(num_field2==num_field1+1);
|
||||||
|
|
||||||
|
// test small word
|
||||||
|
String[] l=spellChecker.suggestSimilar("fvie", 2);
|
||||||
|
this.assertTrue(l[0].equals("five"));
|
||||||
|
|
||||||
|
l=spellChecker.suggestSimilar("fiv", 2);
|
||||||
|
this.assertTrue(l[0].equals("five"));
|
||||||
|
|
||||||
|
l=spellChecker.suggestSimilar("ive", 2);
|
||||||
|
this.assertTrue(l[0].equals("five"));
|
||||||
|
|
||||||
|
l=spellChecker.suggestSimilar("fives", 2);
|
||||||
|
this.assertTrue(l[0].equals("five"));
|
||||||
|
|
||||||
|
l=spellChecker.suggestSimilar("fie", 2);
|
||||||
|
this.assertTrue(l[0].equals("five"));
|
||||||
|
|
||||||
|
l=spellChecker.suggestSimilar("fi", 2);
|
||||||
|
this.assertEquals(0,l.length);
|
||||||
|
|
||||||
|
// test restreint to a field
|
||||||
|
l=spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
|
||||||
|
this.assertEquals(0,l.length); // there isn't the term thousand in the field field1
|
||||||
|
|
||||||
|
l=spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
|
||||||
|
this.assertEquals(1,l.length); // there is the term thousand in the field field2
|
||||||
|
}
|
||||||
|
catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
this.assertTrue(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void addwords (IndexReader r, String field) throws IOException {
|
||||||
|
long time=System.currentTimeMillis();
|
||||||
|
spellChecker.indexDictionnary(new LuceneDictionary(r, field));
|
||||||
|
time=System.currentTimeMillis()-time;
|
||||||
|
System.out.println("time to build "+field+": "+time);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private int numdoc () throws IOException {
|
||||||
|
IndexReader rs=IndexReader.open(spellindex);
|
||||||
|
int num=rs.numDocs();
|
||||||
|
this.assertTrue(num!=0);
|
||||||
|
System.out.println("num docs: "+num);
|
||||||
|
rs.close();
|
||||||
|
return num;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
protected void tearDown () throws Exception {
|
||||||
|
spellChecker=null;
|
||||||
|
super.tearDown();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue