LUCENE-3436: add spellchecker SuggestMode

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1171556 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-09-16 13:41:29 +00:00
parent 4f553352d1
commit c0965ed3a2
7 changed files with 265 additions and 44 deletions

View File

@ -92,6 +92,11 @@ Bug Fixes
* LUCENE-3019: Fix unexpected color tags for FastVectorHighlighter. (Koji Sekiguchi)
API Changes
* LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy
for suggesting related terms. (James Dyer via Robert Muir)
======================= Lucene 3.4.0 ================
New Features

View File

@ -290,21 +290,22 @@ public class DirectSpellChecker {
}
/**
* Calls {@link #suggestSimilar(Term, int, IndexReader, boolean)
* suggestSimilar(term, numSug, ir, false)}
* Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode)
* suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)}
*/
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir)
throws IOException {
return suggestSimilar(term, numSug, ir, false);
return suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
}
/**
* Calls {@link #suggestSimilar(Term, int, IndexReader, boolean, float)
* suggestSimilar(term, numSug, ir, morePopular, this.accuracy)}
* Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode, float)
* suggestSimilar(term, numSug, ir, suggestMode, this.accuracy)}
*
*/
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
boolean morePopular) throws IOException {
return suggestSimilar(term, numSug, ir, morePopular, accuracy);
SuggestMode suggestMode) throws IOException {
return suggestSimilar(term, numSug, ir, suggestMode, this.accuracy);
}
/**
@ -323,7 +324,7 @@ public class DirectSpellChecker {
* @throws IOException
*/
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
boolean morePopular, float accuracy) throws IOException {
SuggestMode suggestMode, float accuracy) throws IOException {
final CharsRef spare = new CharsRef();
String text = term.text();
if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength)
@ -335,9 +336,7 @@ public class DirectSpellChecker {
int docfreq = ir.docFreq(term);
// see line 341 of spellchecker. this is certainly very very nice for perf,
// but is it really the right way to go?
if (!morePopular && docfreq > 0) {
if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) {
return new SuggestWord[0];
}
@ -349,7 +348,7 @@ public class DirectSpellChecker {
return new SuggestWord[0];
}
if (!morePopular) docfreq = 0;
if (suggestMode!=SuggestMode.SUGGEST_MORE_POPULAR) docfreq = 0;
if (thresholdFrequency >= 1f) {
docfreq = Math.max(docfreq, (int) thresholdFrequency);

View File

@ -247,7 +247,7 @@ public class SpellChecker implements java.io.Closeable {
* @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
*/
public String[] suggestSimilar(String word, int numSug) throws IOException {
return this.suggestSimilar(word, numSug, null, null, false);
return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
}
/**
@ -271,7 +271,7 @@ public class SpellChecker implements java.io.Closeable {
* @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
*/
public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException {
return this.suggestSimilar(word, numSug, null, null, false, accuracy);
return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy);
}
/**
@ -300,8 +300,16 @@ public class SpellChecker implements java.io.Closeable {
* first criteria: the edit distance, second criteria (only if restricted mode): the popularity
* of the suggest words in the field of the user index
*
* @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
* @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
*
* @deprecated
* use suggestSimilar(String, int, IndexReader, String, SuggestMode)
* <ul>
* <li>SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX instead of morePopular=false</li>
* <li>SuggestMode.SuGGEST_MORE_POPULAR instead of morePopular=true</li>
* </ul>
*/
@Deprecated
public String[] suggestSimilar(String word, int numSug, IndexReader ir,
String field, boolean morePopular) throws IOException {
return suggestSimilar(word, numSug, ir, field, morePopular, accuracy);
@ -332,19 +340,78 @@ public class SpellChecker implements java.io.Closeable {
* @return String[] the sorted list of the suggest words with these 2 criteria:
* first criteria: the edit distance, second criteria (only if restricted mode): the popularity
* of the suggest words in the field of the user index
*
* @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
*
* @deprecated
* use suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
* <ul>
* <li>SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX instead of morePopular=false</li>
* <li>SuggestMode.SuGGEST_MORE_POPULAR instead of morePopular=true</li>
* </ul>
*/
@Deprecated
public String[] suggestSimilar(String word, int numSug, IndexReader ir,
String field, boolean morePopular, float accuracy) throws IOException {
return suggestSimilar(word, numSug, ir, field, morePopular ? SuggestMode.SUGGEST_MORE_POPULAR :
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy);
}
/**
* Calls {@link #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
* suggestSimilar(word, numSug, ir, suggestMode, field, this.accuracy)}
*
*/
public String[] suggestSimilar(String word, int numSug, IndexReader ir,
String field, SuggestMode suggestMode) throws IOException {
return suggestSimilar(word, numSug, ir, field, suggestMode, this.accuracy);
}
/**
* Suggest similar words (optionally restricted to a field of an index).
*
* <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
* is not the same as the edit distance strategy used to calculate the best
* matching spell-checked word from the hits that Lucene found, one usually has
* to retrieve a couple of numSug's in order to get the true best match.
*
* <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
* Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
*
* @param word the word you want a spell check done on
* @param numSug the number of suggested words
* @param ir the indexReader of the user index (can be null see field param)
* @param field the field of the user index: if field is not null, the suggested
* words are restricted to the words present in this field.
* @param suggestMode
* (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS)
* @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
* @throws IOException if the underlying index throws an {@link IOException}
* @throws AlreadyClosedException if the Spellchecker is already closed
* @return String[] the sorted list of the suggest words with these 2 criteria:
* first criteria: the edit distance, second criteria (only if restricted mode): the popularity
* of the suggest words in the field of the user index
*
*/
public String[] suggestSimilar(String word, int numSug, IndexReader ir,
String field, SuggestMode suggestMode, float accuracy) throws IOException {
// obtainSearcher calls ensureOpen
final IndexSearcher indexSearcher = obtainSearcher();
try{
try {
if (ir == null || field == null) {
suggestMode = SuggestMode.SUGGEST_ALWAYS;
}
if (suggestMode == SuggestMode.SUGGEST_ALWAYS) {
ir = null;
field = null;
}
final int lengthWord = word.length();
final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
final int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
final int goalFreq = suggestMode==SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;
// if the word exists in the real index and we don't care for word frequency, return the word itself
if (!morePopular && freq > 0) {
if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0) {
return new String[] { word };
}
@ -403,7 +470,7 @@ public class SpellChecker implements java.io.Closeable {
if (ir != null && field != null) { // use the user index
sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
// don't suggest a word that is not present in the field
if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {
if ((suggestMode==SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq) || sugWord.freq < 1) {
continue;
}
}

View File

@ -0,0 +1,42 @@
package org.apache.lucene.search.spell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Set of strategies for suggesting related terms
* @lucene.experimental
*/
public enum SuggestMode {
/**
* Generate suggestions only for terms not in the index (default)
*/
SUGGEST_WHEN_NOT_IN_INDEX,
/**
* Return only suggested words that are as frequent or more frequent than the
* searched word
*/
SUGGEST_MORE_POPULAR,
/**
* Always attempt to offer suggestions (however, other parameters may limit
* suggestions. For example, see
* {@link DirectSpellChecker.setMaxQueryFrequency} ).
*/
SUGGEST_ALWAYS
}

View File

@ -45,29 +45,35 @@ public class TestDirectSpellChecker extends LuceneTestCase {
IndexReader ir = writer.getReader();
SuggestWord[] similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
SuggestWord[] similar = spellChecker.suggestSimilar(new Term("numbers",
"fvie"), 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("five", similar[0].string);
similar = spellChecker.suggestSimilar(new Term("numbers", "five"), 2, ir, false);
similar = spellChecker.suggestSimilar(new Term("numbers", "five"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
if (similar.length > 0) {
assertFalse(similar[0].string.equals("five")); // don't suggest a word for itself
}
similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("five", similar[0].string);
similar = spellChecker.suggestSimilar(new Term("numbers", "fiv"), 2, ir, false);
similar = spellChecker.suggestSimilar(new Term("numbers", "fiv"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("five", similar[0].string);
similar = spellChecker.suggestSimilar(new Term("numbers", "fives"), 2, ir, false);
similar = spellChecker.suggestSimilar(new Term("numbers", "fives"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("five", similar[0].string);
assertTrue(similar.length > 0);
similar = spellChecker.suggestSimilar(new Term("numbers", "fie"), 2, ir, false);
similar = spellChecker.suggestSimilar(new Term("numbers", "fie"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals("five", similar[0].string);
// add some more documents
@ -81,7 +87,8 @@ public class TestDirectSpellChecker extends LuceneTestCase {
ir = writer.getReader();
// look ma, no spellcheck index rebuild
similar = spellChecker.suggestSimilar(new Term("numbers", "tousand"), 10, ir, false);
similar = spellChecker.suggestSimilar(new Term("numbers", "tousand"), 10,
ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("thousand", similar[0].string);
@ -109,34 +116,48 @@ public class TestDirectSpellChecker extends LuceneTestCase {
DirectSpellChecker spellChecker = new DirectSpellChecker();
spellChecker.setMaxQueryFrequency(0F);
SuggestWord[] similar = spellChecker.suggestSimilar(new Term("text", "fobar"), 1, ir, true);
SuggestWord[] similar = spellChecker.suggestSimilar(new Term("text",
"fobar"), 1, ir, SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMinQueryLength(5);
similar = spellChecker.suggestSimilar(new Term("text", "foba"), 1, ir, true);
similar = spellChecker.suggestSimilar(new Term("text", "foba"), 1, ir,
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMaxEdits(1);
similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true);
similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir,
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setAccuracy(0.9F);
similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true);
similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir,
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMinPrefix(0);
similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true);
similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir,
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(1, similar.length);
similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir,
SuggestMode.SUGGEST_MORE_POPULAR);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMinPrefix(1);
similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true);
similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir,
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMaxEdits(2);
similar = spellChecker.suggestSimilar(new Term("text", "fobar"), 2, ir,
SuggestMode.SUGGEST_ALWAYS);
assertEquals(2, similar.length);
ir.close();
writer.close();
dir.close();
@ -156,7 +177,9 @@ public class TestDirectSpellChecker extends LuceneTestCase {
IndexReader ir = writer.getReader();
SuggestWord[] similar = spellChecker.suggestSimilar(new Term("bogusFieldBogusField", "fvie"), 2, ir, false);
SuggestWord[] similar = spellChecker.suggestSimilar(new Term(
"bogusFieldBogusField", "fvie"), 2, ir,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(0, similar.length);
ir.close();
writer.close();

View File

@ -63,6 +63,29 @@ public class TestSpellChecker extends LuceneTestCase {
doc.add(newField("field3", "fvei" + (i % 2 == 0 ? " five" : ""), TextField.TYPE_STORED)); // + word thousand
writer.addDocument(doc);
}
{
Document doc = new Document();
doc.add(newField("field1", "eight", TextField.TYPE_STORED)); // "eight" in
// the index
// twice
writer.addDocument(doc);
}
{
Document doc = new Document();
doc
.add(newField("field1", "twenty-one twenty-one",
TextField.TYPE_STORED)); // "twenty-one" in the index thrice
writer.addDocument(doc);
}
{
Document doc = new Document();
doc.add(newField("field1", "twenty", TextField.TYPE_STORED)); // "twenty"
// in the
// index
// twice
writer.addDocument(doc);
}
writer.close();
searchers = Collections.synchronizedList(new ArrayList<IndexSearcher>());
// create the spellChecker
@ -126,7 +149,8 @@ public class TestSpellChecker extends LuceneTestCase {
SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
addwords(r, compareSP, "field3");
String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3", false);
String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3",
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length == 2);
//five and fvei have the same score, but different frequencies.
assertEquals("fvei", similar[0]);
@ -143,7 +167,8 @@ public class TestSpellChecker extends LuceneTestCase {
SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
addwords(r, compareSP, "field3");
String[] similar = compareSP.suggestSimilar("fvie", 2, r, "bogusFieldBogusField", false);
String[] similar = compareSP.suggestSimilar("fvie", 2, r,
"bogusFieldBogusField", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(0, similar.length);
r.close();
if (!compareSP.isClosed())
@ -151,6 +176,60 @@ public class TestSpellChecker extends LuceneTestCase {
compIdx.close();
}
public void testSuggestModes() throws Exception {
IndexReader r = IndexReader.open(userindex, true);
spellChecker.clearIndex();
addwords(r, spellChecker, "field1");
{
String[] similar = spellChecker.suggestSimilar("eighty", 2, r, "field1",
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(1, similar.length);
assertEquals("eighty", similar[0]);
}
{
String[] similar = spellChecker.suggestSimilar("eight", 2, r, "field1",
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(1, similar.length);
assertEquals("eight", similar[0]);
}
{
String[] similar = spellChecker.suggestSimilar("eighty", 5, r, "field1",
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(5, similar.length);
assertEquals("eight", similar[0]);
}
{
String[] similar = spellChecker.suggestSimilar("twenty", 5, r, "field1",
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(1, similar.length);
assertEquals("twenty-one", similar[0]);
}
{
String[] similar = spellChecker.suggestSimilar("eight", 5, r, "field1",
SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
}
{
String[] similar = spellChecker.suggestSimilar("eighty", 5, r, "field1",
SuggestMode.SUGGEST_ALWAYS);
assertEquals(5, similar.length);
assertEquals("eight", similar[0]);
}
{
String[] similar = spellChecker.suggestSimilar("eight", 5, r, "field1",
SuggestMode.SUGGEST_ALWAYS);
assertEquals(5, similar.length);
assertEquals("eighty", similar[0]);
}
r.close();
}
private void checkCommonSuggestions(IndexReader r) throws IOException {
String[] similar = spellChecker.suggestSimilar("fvie", 2);
assertTrue(similar.length > 0);
@ -174,10 +253,12 @@ public class TestSpellChecker extends LuceneTestCase {
assertEquals(similar[0], "five");
// test restraint to a field
similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
similar = spellChecker.suggestSimilar("tousand", 10, r, "field1",
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(0, similar.length); // there isn't the term thousand in the field field1
similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
similar = spellChecker.suggestSimilar("tousand", 10, r, "field2",
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(1, similar.length); // there is the term thousand in the field field2
}
@ -214,10 +295,12 @@ public class TestSpellChecker extends LuceneTestCase {
assertEquals(similar[0], "five");
// test restraint to a field
similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
similar = spellChecker.suggestSimilar("tousand", 10, r, "field1",
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(0, similar.length); // there isn't the term thousand in the field field1
similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
similar = spellChecker.suggestSimilar("tousand", 10, r, "field2",
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(1, similar.length); // there is the term thousand in the field field2
similar = spellChecker.suggestSimilar("onety", 2);
@ -225,7 +308,8 @@ public class TestSpellChecker extends LuceneTestCase {
assertEquals(similar[0], "ninety");
assertEquals(similar[1], "one");
try {
similar = spellChecker.suggestSimilar("tousand", 10, r, null, false);
similar = spellChecker.suggestSimilar("tousand", 10, r, null,
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
} catch (NullPointerException e) {
assertTrue("threw an NPE, and it shouldn't have", false);
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
import org.apache.lucene.search.spell.SuggestWordQueue;
@ -195,11 +196,11 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
SpellingResult result = new SpellingResult();
float accuracy = (options.accuracy == Float.MIN_VALUE) ? checker.getAccuracy() : options.accuracy;
SuggestMode mode = options.onlyMorePopular ? SuggestMode.SUGGEST_MORE_POPULAR : SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
for (Token token : options.tokens) {
Term term = new Term(field, token.toString());
SuggestWord[] suggestions = checker.suggestSimilar(term,
options.count, options.reader, options.onlyMorePopular, accuracy);
options.count, options.reader, mode, accuracy);
result.addFrequency(token, options.reader.docFreq(term));
for (SuggestWord suggestion : suggestions) {
result.add(token, suggestion.string, suggestion.freq);