mirror of https://github.com/apache/lucene.git
LUCENE-3436: add spellchecker SuggestMode
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1171556 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4f553352d1
commit
c0965ed3a2
|
@ -92,6 +92,11 @@ Bug Fixes
|
||||||
|
|
||||||
* LUCENE-3019: Fix unexpected color tags for FastVectorHighlighter. (Koji Sekiguchi)
|
* LUCENE-3019: Fix unexpected color tags for FastVectorHighlighter. (Koji Sekiguchi)
|
||||||
|
|
||||||
|
API Changes
|
||||||
|
|
||||||
|
* LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy
|
||||||
|
for suggesting related terms. (James Dyer via Robert Muir)
|
||||||
|
|
||||||
======================= Lucene 3.4.0 ================
|
======================= Lucene 3.4.0 ================
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
|
|
|
@ -290,21 +290,22 @@ public class DirectSpellChecker {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calls {@link #suggestSimilar(Term, int, IndexReader, boolean)
|
* Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode)
|
||||||
* suggestSimilar(term, numSug, ir, false)}
|
* suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)}
|
||||||
*/
|
*/
|
||||||
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir)
|
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return suggestSimilar(term, numSug, ir, false);
|
return suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calls {@link #suggestSimilar(Term, int, IndexReader, boolean, float)
|
* Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode, float)
|
||||||
* suggestSimilar(term, numSug, ir, morePopular, this.accuracy)}
|
* suggestSimilar(term, numSug, ir, suggestMode, this.accuracy)}
|
||||||
|
*
|
||||||
*/
|
*/
|
||||||
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
|
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
|
||||||
boolean morePopular) throws IOException {
|
SuggestMode suggestMode) throws IOException {
|
||||||
return suggestSimilar(term, numSug, ir, morePopular, accuracy);
|
return suggestSimilar(term, numSug, ir, suggestMode, this.accuracy);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -323,7 +324,7 @@ public class DirectSpellChecker {
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
|
public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir,
|
||||||
boolean morePopular, float accuracy) throws IOException {
|
SuggestMode suggestMode, float accuracy) throws IOException {
|
||||||
final CharsRef spare = new CharsRef();
|
final CharsRef spare = new CharsRef();
|
||||||
String text = term.text();
|
String text = term.text();
|
||||||
if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength)
|
if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength)
|
||||||
|
@ -335,9 +336,7 @@ public class DirectSpellChecker {
|
||||||
|
|
||||||
int docfreq = ir.docFreq(term);
|
int docfreq = ir.docFreq(term);
|
||||||
|
|
||||||
// see line 341 of spellchecker. this is certainly very very nice for perf,
|
if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) {
|
||||||
// but is it really the right way to go?
|
|
||||||
if (!morePopular && docfreq > 0) {
|
|
||||||
return new SuggestWord[0];
|
return new SuggestWord[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -349,7 +348,7 @@ public class DirectSpellChecker {
|
||||||
return new SuggestWord[0];
|
return new SuggestWord[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!morePopular) docfreq = 0;
|
if (suggestMode!=SuggestMode.SUGGEST_MORE_POPULAR) docfreq = 0;
|
||||||
|
|
||||||
if (thresholdFrequency >= 1f) {
|
if (thresholdFrequency >= 1f) {
|
||||||
docfreq = Math.max(docfreq, (int) thresholdFrequency);
|
docfreq = Math.max(docfreq, (int) thresholdFrequency);
|
||||||
|
|
|
@ -247,7 +247,7 @@ public class SpellChecker implements java.io.Closeable {
|
||||||
* @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
|
* @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
|
||||||
*/
|
*/
|
||||||
public String[] suggestSimilar(String word, int numSug) throws IOException {
|
public String[] suggestSimilar(String word, int numSug) throws IOException {
|
||||||
return this.suggestSimilar(word, numSug, null, null, false);
|
return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -271,7 +271,7 @@ public class SpellChecker implements java.io.Closeable {
|
||||||
* @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
|
* @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
|
||||||
*/
|
*/
|
||||||
public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException {
|
public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException {
|
||||||
return this.suggestSimilar(word, numSug, null, null, false, accuracy);
|
return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -300,8 +300,16 @@ public class SpellChecker implements java.io.Closeable {
|
||||||
* first criteria: the edit distance, second criteria (only if restricted mode): the popularity
|
* first criteria: the edit distance, second criteria (only if restricted mode): the popularity
|
||||||
* of the suggest words in the field of the user index
|
* of the suggest words in the field of the user index
|
||||||
*
|
*
|
||||||
* @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float)
|
* @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
|
||||||
|
*
|
||||||
|
* @deprecated
|
||||||
|
* use suggestSimilar(String, int, IndexReader, String, SuggestMode)
|
||||||
|
* <ul>
|
||||||
|
* <li>SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX instead of morePopular=false</li>
|
||||||
|
* <li>SuggestMode.SUGGEST_MORE_POPULAR instead of morePopular=true</li>
|
||||||
|
* </ul>
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public String[] suggestSimilar(String word, int numSug, IndexReader ir,
|
public String[] suggestSimilar(String word, int numSug, IndexReader ir,
|
||||||
String field, boolean morePopular) throws IOException {
|
String field, boolean morePopular) throws IOException {
|
||||||
return suggestSimilar(word, numSug, ir, field, morePopular, accuracy);
|
return suggestSimilar(word, numSug, ir, field, morePopular, accuracy);
|
||||||
|
@ -332,19 +340,78 @@ public class SpellChecker implements java.io.Closeable {
|
||||||
* @return String[] the sorted list of the suggest words with these 2 criteria:
|
* @return String[] the sorted list of the suggest words with these 2 criteria:
|
||||||
* first criteria: the edit distance, second criteria (only if restricted mode): the popularity
|
* first criteria: the edit distance, second criteria (only if restricted mode): the popularity
|
||||||
* of the suggest words in the field of the user index
|
* of the suggest words in the field of the user index
|
||||||
|
*
|
||||||
|
* @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
|
||||||
|
*
|
||||||
|
* @deprecated
|
||||||
|
* use suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
|
||||||
|
* <ul>
|
||||||
|
* <li>SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX instead of morePopular=false</li>
|
||||||
|
* <li>SuggestMode.SUGGEST_MORE_POPULAR instead of morePopular=true</li>
|
||||||
|
* </ul>
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public String[] suggestSimilar(String word, int numSug, IndexReader ir,
|
public String[] suggestSimilar(String word, int numSug, IndexReader ir,
|
||||||
String field, boolean morePopular, float accuracy) throws IOException {
|
String field, boolean morePopular, float accuracy) throws IOException {
|
||||||
|
return suggestSimilar(word, numSug, ir, field, morePopular ? SuggestMode.SUGGEST_MORE_POPULAR :
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calls {@link #suggestSimilar(String, int, IndexReader, String, SuggestMode, float)
|
||||||
|
* suggestSimilar(word, numSug, ir, field, suggestMode, this.accuracy)}
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public String[] suggestSimilar(String word, int numSug, IndexReader ir,
|
||||||
|
String field, SuggestMode suggestMode) throws IOException {
|
||||||
|
return suggestSimilar(word, numSug, ir, field, suggestMode, this.accuracy);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Suggest similar words (optionally restricted to a field of an index).
|
||||||
|
*
|
||||||
|
* <p>As the Lucene similarity that is used to fetch the most relevant n-grammed terms
|
||||||
|
* is not the same as the edit distance strategy used to calculate the best
|
||||||
|
* matching spell-checked word from the hits that Lucene found, one usually has
|
||||||
|
* to retrieve a couple of numSug's in order to get the true best match.
|
||||||
|
*
|
||||||
|
* <p>I.e. if numSug == 1, don't count on that suggestion being the best one.
|
||||||
|
* Thus, you should set this value to <b>at least</b> 5 for a good suggestion.
|
||||||
|
*
|
||||||
|
* @param word the word you want a spell check done on
|
||||||
|
* @param numSug the number of suggested words
|
||||||
|
* @param ir the indexReader of the user index (can be null see field param)
|
||||||
|
* @param field the field of the user index: if field is not null, the suggested
|
||||||
|
* words are restricted to the words present in this field.
|
||||||
|
* @param suggestMode the strategy used to decide when suggestions are offered
|
||||||
|
* (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS)
|
||||||
|
* @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
|
||||||
|
* @throws IOException if the underlying index throws an {@link IOException}
|
||||||
|
* @throws AlreadyClosedException if the Spellchecker is already closed
|
||||||
|
* @return String[] the sorted list of the suggest words with these 2 criteria:
|
||||||
|
* first criteria: the edit distance, second criteria (only if restricted mode): the popularity
|
||||||
|
* of the suggest words in the field of the user index
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public String[] suggestSimilar(String word, int numSug, IndexReader ir,
|
||||||
|
String field, SuggestMode suggestMode, float accuracy) throws IOException {
|
||||||
// obtainSearcher calls ensureOpen
|
// obtainSearcher calls ensureOpen
|
||||||
final IndexSearcher indexSearcher = obtainSearcher();
|
final IndexSearcher indexSearcher = obtainSearcher();
|
||||||
try{
|
try {
|
||||||
|
if (ir == null || field == null) {
|
||||||
|
suggestMode = SuggestMode.SUGGEST_ALWAYS;
|
||||||
|
}
|
||||||
|
if (suggestMode == SuggestMode.SUGGEST_ALWAYS) {
|
||||||
|
ir = null;
|
||||||
|
field = null;
|
||||||
|
}
|
||||||
|
|
||||||
final int lengthWord = word.length();
|
final int lengthWord = word.length();
|
||||||
|
|
||||||
final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
|
final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
|
||||||
final int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
|
final int goalFreq = suggestMode==SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;
|
||||||
// if the word exists in the real index and we don't care for word frequency, return the word itself
|
// if the word exists in the real index and we don't care for word frequency, return the word itself
|
||||||
if (!morePopular && freq > 0) {
|
if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0) {
|
||||||
return new String[] { word };
|
return new String[] { word };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -403,7 +470,7 @@ public class SpellChecker implements java.io.Closeable {
|
||||||
if (ir != null && field != null) { // use the user index
|
if (ir != null && field != null) { // use the user index
|
||||||
sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
|
sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
|
||||||
// don't suggest a word that is not present in the field
|
// don't suggest a word that is not present in the field
|
||||||
if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {
|
if ((suggestMode==SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq) || sugWord.freq < 1) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,42 @@
|
||||||
|
package org.apache.lucene.search.spell;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set of strategies for suggesting related terms
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public enum SuggestMode {
|
||||||
|
/**
|
||||||
|
* Generate suggestions only for terms not in the index (default)
|
||||||
|
*/
|
||||||
|
SUGGEST_WHEN_NOT_IN_INDEX,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return only suggested words that are as frequent or more frequent than the
|
||||||
|
* searched word
|
||||||
|
*/
|
||||||
|
SUGGEST_MORE_POPULAR,
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Always attempt to offer suggestions (however, other parameters may limit
|
||||||
|
* suggestions. For example, see
|
||||||
|
* {@link DirectSpellChecker#setMaxQueryFrequency(float)}).
|
||||||
|
*/
|
||||||
|
SUGGEST_ALWAYS
|
||||||
|
}
|
|
@ -45,29 +45,35 @@ public class TestDirectSpellChecker extends LuceneTestCase {
|
||||||
|
|
||||||
IndexReader ir = writer.getReader();
|
IndexReader ir = writer.getReader();
|
||||||
|
|
||||||
SuggestWord[] similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
|
SuggestWord[] similar = spellChecker.suggestSimilar(new Term("numbers",
|
||||||
|
"fvie"), 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
assertTrue(similar.length > 0);
|
assertTrue(similar.length > 0);
|
||||||
assertEquals("five", similar[0].string);
|
assertEquals("five", similar[0].string);
|
||||||
|
|
||||||
similar = spellChecker.suggestSimilar(new Term("numbers", "five"), 2, ir, false);
|
similar = spellChecker.suggestSimilar(new Term("numbers", "five"), 2, ir,
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
if (similar.length > 0) {
|
if (similar.length > 0) {
|
||||||
assertFalse(similar[0].string.equals("five")); // don't suggest a word for itself
|
assertFalse(similar[0].string.equals("five")); // don't suggest a word for itself
|
||||||
}
|
}
|
||||||
|
|
||||||
similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
|
similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir,
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
assertTrue(similar.length > 0);
|
assertTrue(similar.length > 0);
|
||||||
assertEquals("five", similar[0].string);
|
assertEquals("five", similar[0].string);
|
||||||
|
|
||||||
similar = spellChecker.suggestSimilar(new Term("numbers", "fiv"), 2, ir, false);
|
similar = spellChecker.suggestSimilar(new Term("numbers", "fiv"), 2, ir,
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
assertTrue(similar.length > 0);
|
assertTrue(similar.length > 0);
|
||||||
assertEquals("five", similar[0].string);
|
assertEquals("five", similar[0].string);
|
||||||
|
|
||||||
similar = spellChecker.suggestSimilar(new Term("numbers", "fives"), 2, ir, false);
|
similar = spellChecker.suggestSimilar(new Term("numbers", "fives"), 2, ir,
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
assertTrue(similar.length > 0);
|
assertTrue(similar.length > 0);
|
||||||
assertEquals("five", similar[0].string);
|
assertEquals("five", similar[0].string);
|
||||||
|
|
||||||
assertTrue(similar.length > 0);
|
assertTrue(similar.length > 0);
|
||||||
similar = spellChecker.suggestSimilar(new Term("numbers", "fie"), 2, ir, false);
|
similar = spellChecker.suggestSimilar(new Term("numbers", "fie"), 2, ir,
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
assertEquals("five", similar[0].string);
|
assertEquals("five", similar[0].string);
|
||||||
|
|
||||||
// add some more documents
|
// add some more documents
|
||||||
|
@ -81,7 +87,8 @@ public class TestDirectSpellChecker extends LuceneTestCase {
|
||||||
ir = writer.getReader();
|
ir = writer.getReader();
|
||||||
|
|
||||||
// look ma, no spellcheck index rebuild
|
// look ma, no spellcheck index rebuild
|
||||||
similar = spellChecker.suggestSimilar(new Term("numbers", "tousand"), 10, ir, false);
|
similar = spellChecker.suggestSimilar(new Term("numbers", "tousand"), 10,
|
||||||
|
ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
assertTrue(similar.length > 0);
|
assertTrue(similar.length > 0);
|
||||||
assertEquals("thousand", similar[0].string);
|
assertEquals("thousand", similar[0].string);
|
||||||
|
|
||||||
|
@ -109,34 +116,48 @@ public class TestDirectSpellChecker extends LuceneTestCase {
|
||||||
|
|
||||||
DirectSpellChecker spellChecker = new DirectSpellChecker();
|
DirectSpellChecker spellChecker = new DirectSpellChecker();
|
||||||
spellChecker.setMaxQueryFrequency(0F);
|
spellChecker.setMaxQueryFrequency(0F);
|
||||||
SuggestWord[] similar = spellChecker.suggestSimilar(new Term("text", "fobar"), 1, ir, true);
|
SuggestWord[] similar = spellChecker.suggestSimilar(new Term("text",
|
||||||
|
"fobar"), 1, ir, SuggestMode.SUGGEST_MORE_POPULAR);
|
||||||
assertEquals(0, similar.length);
|
assertEquals(0, similar.length);
|
||||||
|
|
||||||
spellChecker = new DirectSpellChecker(); // reset defaults
|
spellChecker = new DirectSpellChecker(); // reset defaults
|
||||||
spellChecker.setMinQueryLength(5);
|
spellChecker.setMinQueryLength(5);
|
||||||
similar = spellChecker.suggestSimilar(new Term("text", "foba"), 1, ir, true);
|
similar = spellChecker.suggestSimilar(new Term("text", "foba"), 1, ir,
|
||||||
|
SuggestMode.SUGGEST_MORE_POPULAR);
|
||||||
assertEquals(0, similar.length);
|
assertEquals(0, similar.length);
|
||||||
|
|
||||||
spellChecker = new DirectSpellChecker(); // reset defaults
|
spellChecker = new DirectSpellChecker(); // reset defaults
|
||||||
spellChecker.setMaxEdits(1);
|
spellChecker.setMaxEdits(1);
|
||||||
similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true);
|
similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir,
|
||||||
|
SuggestMode.SUGGEST_MORE_POPULAR);
|
||||||
assertEquals(0, similar.length);
|
assertEquals(0, similar.length);
|
||||||
|
|
||||||
spellChecker = new DirectSpellChecker(); // reset defaults
|
spellChecker = new DirectSpellChecker(); // reset defaults
|
||||||
spellChecker.setAccuracy(0.9F);
|
spellChecker.setAccuracy(0.9F);
|
||||||
similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true);
|
similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir,
|
||||||
|
SuggestMode.SUGGEST_MORE_POPULAR);
|
||||||
assertEquals(0, similar.length);
|
assertEquals(0, similar.length);
|
||||||
|
|
||||||
spellChecker = new DirectSpellChecker(); // reset defaults
|
spellChecker = new DirectSpellChecker(); // reset defaults
|
||||||
spellChecker.setMinPrefix(0);
|
spellChecker.setMinPrefix(0);
|
||||||
similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true);
|
similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir,
|
||||||
|
SuggestMode.SUGGEST_MORE_POPULAR);
|
||||||
assertEquals(1, similar.length);
|
assertEquals(1, similar.length);
|
||||||
|
similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir,
|
||||||
|
SuggestMode.SUGGEST_MORE_POPULAR);
|
||||||
|
|
||||||
spellChecker = new DirectSpellChecker(); // reset defaults
|
spellChecker = new DirectSpellChecker(); // reset defaults
|
||||||
spellChecker.setMinPrefix(1);
|
spellChecker.setMinPrefix(1);
|
||||||
similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true);
|
similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir,
|
||||||
|
SuggestMode.SUGGEST_MORE_POPULAR);
|
||||||
assertEquals(0, similar.length);
|
assertEquals(0, similar.length);
|
||||||
|
|
||||||
|
spellChecker = new DirectSpellChecker(); // reset defaults
|
||||||
|
spellChecker.setMaxEdits(2);
|
||||||
|
similar = spellChecker.suggestSimilar(new Term("text", "fobar"), 2, ir,
|
||||||
|
SuggestMode.SUGGEST_ALWAYS);
|
||||||
|
assertEquals(2, similar.length);
|
||||||
|
|
||||||
ir.close();
|
ir.close();
|
||||||
writer.close();
|
writer.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
|
@ -156,7 +177,9 @@ public class TestDirectSpellChecker extends LuceneTestCase {
|
||||||
|
|
||||||
IndexReader ir = writer.getReader();
|
IndexReader ir = writer.getReader();
|
||||||
|
|
||||||
SuggestWord[] similar = spellChecker.suggestSimilar(new Term("bogusFieldBogusField", "fvie"), 2, ir, false);
|
SuggestWord[] similar = spellChecker.suggestSimilar(new Term(
|
||||||
|
"bogusFieldBogusField", "fvie"), 2, ir,
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
assertEquals(0, similar.length);
|
assertEquals(0, similar.length);
|
||||||
ir.close();
|
ir.close();
|
||||||
writer.close();
|
writer.close();
|
||||||
|
|
|
@ -63,6 +63,29 @@ public class TestSpellChecker extends LuceneTestCase {
|
||||||
doc.add(newField("field3", "fvei" + (i % 2 == 0 ? " five" : ""), TextField.TYPE_STORED)); // + word thousand
|
doc.add(newField("field3", "fvei" + (i % 2 == 0 ? " five" : ""), TextField.TYPE_STORED)); // + word thousand
|
||||||
writer.addDocument(doc);
|
writer.addDocument(doc);
|
||||||
}
|
}
|
||||||
|
{
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newField("field1", "eight", TextField.TYPE_STORED)); // "eight" in
|
||||||
|
// the index
|
||||||
|
// twice
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
Document doc = new Document();
|
||||||
|
doc
|
||||||
|
.add(newField("field1", "twenty-one twenty-one",
|
||||||
|
TextField.TYPE_STORED)); // "twenty-one" in the index thrice
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newField("field1", "twenty", TextField.TYPE_STORED)); // "twenty"
|
||||||
|
// in the
|
||||||
|
// index
|
||||||
|
// twice
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
|
||||||
writer.close();
|
writer.close();
|
||||||
searchers = Collections.synchronizedList(new ArrayList<IndexSearcher>());
|
searchers = Collections.synchronizedList(new ArrayList<IndexSearcher>());
|
||||||
// create the spellChecker
|
// create the spellChecker
|
||||||
|
@ -126,7 +149,8 @@ public class TestSpellChecker extends LuceneTestCase {
|
||||||
SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
|
SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
|
||||||
addwords(r, compareSP, "field3");
|
addwords(r, compareSP, "field3");
|
||||||
|
|
||||||
String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3", false);
|
String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3",
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
assertTrue(similar.length == 2);
|
assertTrue(similar.length == 2);
|
||||||
//five and fvei have the same score, but different frequencies.
|
//five and fvei have the same score, but different frequencies.
|
||||||
assertEquals("fvei", similar[0]);
|
assertEquals("fvei", similar[0]);
|
||||||
|
@ -143,7 +167,8 @@ public class TestSpellChecker extends LuceneTestCase {
|
||||||
SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
|
SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
|
||||||
addwords(r, compareSP, "field3");
|
addwords(r, compareSP, "field3");
|
||||||
|
|
||||||
String[] similar = compareSP.suggestSimilar("fvie", 2, r, "bogusFieldBogusField", false);
|
String[] similar = compareSP.suggestSimilar("fvie", 2, r,
|
||||||
|
"bogusFieldBogusField", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
assertEquals(0, similar.length);
|
assertEquals(0, similar.length);
|
||||||
r.close();
|
r.close();
|
||||||
if (!compareSP.isClosed())
|
if (!compareSP.isClosed())
|
||||||
|
@ -151,6 +176,60 @@ public class TestSpellChecker extends LuceneTestCase {
|
||||||
compIdx.close();
|
compIdx.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testSuggestModes() throws Exception {
|
||||||
|
IndexReader r = IndexReader.open(userindex, true);
|
||||||
|
spellChecker.clearIndex();
|
||||||
|
addwords(r, spellChecker, "field1");
|
||||||
|
|
||||||
|
{
|
||||||
|
String[] similar = spellChecker.suggestSimilar("eighty", 2, r, "field1",
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
|
assertEquals(1, similar.length);
|
||||||
|
assertEquals("eighty", similar[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
String[] similar = spellChecker.suggestSimilar("eight", 2, r, "field1",
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
|
assertEquals(1, similar.length);
|
||||||
|
assertEquals("eight", similar[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
String[] similar = spellChecker.suggestSimilar("eighty", 5, r, "field1",
|
||||||
|
SuggestMode.SUGGEST_MORE_POPULAR);
|
||||||
|
assertEquals(5, similar.length);
|
||||||
|
assertEquals("eight", similar[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
String[] similar = spellChecker.suggestSimilar("twenty", 5, r, "field1",
|
||||||
|
SuggestMode.SUGGEST_MORE_POPULAR);
|
||||||
|
assertEquals(1, similar.length);
|
||||||
|
assertEquals("twenty-one", similar[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
String[] similar = spellChecker.suggestSimilar("eight", 5, r, "field1",
|
||||||
|
SuggestMode.SUGGEST_MORE_POPULAR);
|
||||||
|
assertEquals(0, similar.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
String[] similar = spellChecker.suggestSimilar("eighty", 5, r, "field1",
|
||||||
|
SuggestMode.SUGGEST_ALWAYS);
|
||||||
|
assertEquals(5, similar.length);
|
||||||
|
assertEquals("eight", similar[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
String[] similar = spellChecker.suggestSimilar("eight", 5, r, "field1",
|
||||||
|
SuggestMode.SUGGEST_ALWAYS);
|
||||||
|
assertEquals(5, similar.length);
|
||||||
|
assertEquals("eighty", similar[0]);
|
||||||
|
}
|
||||||
|
r.close();
|
||||||
|
}
|
||||||
private void checkCommonSuggestions(IndexReader r) throws IOException {
|
private void checkCommonSuggestions(IndexReader r) throws IOException {
|
||||||
String[] similar = spellChecker.suggestSimilar("fvie", 2);
|
String[] similar = spellChecker.suggestSimilar("fvie", 2);
|
||||||
assertTrue(similar.length > 0);
|
assertTrue(similar.length > 0);
|
||||||
|
@ -174,10 +253,12 @@ public class TestSpellChecker extends LuceneTestCase {
|
||||||
assertEquals(similar[0], "five");
|
assertEquals(similar[0], "five");
|
||||||
|
|
||||||
// test restraint to a field
|
// test restraint to a field
|
||||||
similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
|
similar = spellChecker.suggestSimilar("tousand", 10, r, "field1",
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
assertEquals(0, similar.length); // there isn't the term thousand in the field field1
|
assertEquals(0, similar.length); // there isn't the term thousand in the field field1
|
||||||
|
|
||||||
similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
|
similar = spellChecker.suggestSimilar("tousand", 10, r, "field2",
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
assertEquals(1, similar.length); // there is the term thousand in the field field2
|
assertEquals(1, similar.length); // there is the term thousand in the field field2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -214,10 +295,12 @@ public class TestSpellChecker extends LuceneTestCase {
|
||||||
assertEquals(similar[0], "five");
|
assertEquals(similar[0], "five");
|
||||||
|
|
||||||
// test restraint to a field
|
// test restraint to a field
|
||||||
similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", false);
|
similar = spellChecker.suggestSimilar("tousand", 10, r, "field1",
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
assertEquals(0, similar.length); // there isn't the term thousand in the field field1
|
assertEquals(0, similar.length); // there isn't the term thousand in the field field1
|
||||||
|
|
||||||
similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", false);
|
similar = spellChecker.suggestSimilar("tousand", 10, r, "field2",
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
assertEquals(1, similar.length); // there is the term thousand in the field field2
|
assertEquals(1, similar.length); // there is the term thousand in the field field2
|
||||||
|
|
||||||
similar = spellChecker.suggestSimilar("onety", 2);
|
similar = spellChecker.suggestSimilar("onety", 2);
|
||||||
|
@ -225,7 +308,8 @@ public class TestSpellChecker extends LuceneTestCase {
|
||||||
assertEquals(similar[0], "ninety");
|
assertEquals(similar[0], "ninety");
|
||||||
assertEquals(similar[1], "one");
|
assertEquals(similar[1], "one");
|
||||||
try {
|
try {
|
||||||
similar = spellChecker.suggestSimilar("tousand", 10, r, null, false);
|
similar = spellChecker.suggestSimilar("tousand", 10, r, null,
|
||||||
|
SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
|
||||||
} catch (NullPointerException e) {
|
} catch (NullPointerException e) {
|
||||||
assertTrue("threw an NPE, and it shouldn't have", false);
|
assertTrue("threw an NPE, and it shouldn't have", false);
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,6 +25,7 @@ import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.search.spell.DirectSpellChecker;
|
import org.apache.lucene.search.spell.DirectSpellChecker;
|
||||||
import org.apache.lucene.search.spell.StringDistance;
|
import org.apache.lucene.search.spell.StringDistance;
|
||||||
|
import org.apache.lucene.search.spell.SuggestMode;
|
||||||
import org.apache.lucene.search.spell.SuggestWord;
|
import org.apache.lucene.search.spell.SuggestWord;
|
||||||
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
|
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
|
||||||
import org.apache.lucene.search.spell.SuggestWordQueue;
|
import org.apache.lucene.search.spell.SuggestWordQueue;
|
||||||
|
@ -195,11 +196,11 @@ public class DirectSolrSpellChecker extends SolrSpellChecker {
|
||||||
|
|
||||||
SpellingResult result = new SpellingResult();
|
SpellingResult result = new SpellingResult();
|
||||||
float accuracy = (options.accuracy == Float.MIN_VALUE) ? checker.getAccuracy() : options.accuracy;
|
float accuracy = (options.accuracy == Float.MIN_VALUE) ? checker.getAccuracy() : options.accuracy;
|
||||||
|
SuggestMode mode = options.onlyMorePopular ? SuggestMode.SUGGEST_MORE_POPULAR : SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX;
|
||||||
for (Token token : options.tokens) {
|
for (Token token : options.tokens) {
|
||||||
Term term = new Term(field, token.toString());
|
Term term = new Term(field, token.toString());
|
||||||
SuggestWord[] suggestions = checker.suggestSimilar(term,
|
SuggestWord[] suggestions = checker.suggestSimilar(term,
|
||||||
options.count, options.reader, options.onlyMorePopular, accuracy);
|
options.count, options.reader, mode, accuracy);
|
||||||
result.addFrequency(token, options.reader.docFreq(term));
|
result.addFrequency(token, options.reader.docFreq(term));
|
||||||
for (SuggestWord suggestion : suggestions) {
|
for (SuggestWord suggestion : suggestions) {
|
||||||
result.add(token, suggestion.string, suggestion.freq);
|
result.add(token, suggestion.string, suggestion.freq);
|
||||||
|
|
Loading…
Reference in New Issue