diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index f9474ffeb61..2a670b5c5cd 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -92,6 +92,11 @@ Bug Fixes * LUCENE-3019: Fix unexpected color tags for FastVectorHighlighter. (Koji Sekiguchi) +API Changes + + * LUCENE-3436: Add SuggestMode to the spellchecker, so you can specify the strategy + for suggesting related terms. (James Dyer via Robert Muir) + ======================= Lucene 3.4.0 ================ New Features diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java b/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java index 66b76d7d18c..88e5ca1358c 100644 --- a/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java +++ b/modules/suggest/src/java/org/apache/lucene/search/spell/DirectSpellChecker.java @@ -290,21 +290,22 @@ public class DirectSpellChecker { } /** - * Calls {@link #suggestSimilar(Term, int, IndexReader, boolean) - * suggestSimilar(term, numSug, ir, false)} + * Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode) + * suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)} */ public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir) throws IOException { - return suggestSimilar(term, numSug, ir, false); + return suggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); } /** - * Calls {@link #suggestSimilar(Term, int, IndexReader, boolean, float) - * suggestSimilar(term, numSug, ir, morePopular, this.accuracy)} + * Calls {@link #suggestSimilar(Term, int, IndexReader, SuggestMode, float) + * suggestSimilar(term, numSug, ir, suggestMode, this.accuracy)} + * */ public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, - boolean morePopular) throws IOException { - return suggestSimilar(term, numSug, ir, morePopular, accuracy); + SuggestMode suggestMode) throws IOException { + return suggestSimilar(term, numSug, ir, suggestMode, this.accuracy); } /** @@ -323,7 +324,7 @@ public class DirectSpellChecker { * @throws IOException */ public SuggestWord[] suggestSimilar(Term term, int numSug, IndexReader ir, - boolean morePopular, float accuracy) throws IOException { + SuggestMode suggestMode, float accuracy) throws IOException { final CharsRef spare = new CharsRef(); String text = term.text(); if (minQueryLength > 0 && text.codePointCount(0, text.length()) < minQueryLength) @@ -335,9 +336,7 @@ public class DirectSpellChecker { int docfreq = ir.docFreq(term); - // see line 341 of spellchecker. this is certainly very very nice for perf, - // but is it really the right way to go? - if (!morePopular && docfreq > 0) { + if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0) { return new SuggestWord[0]; } @@ -349,7 +348,7 @@ public class DirectSpellChecker { return new SuggestWord[0]; } - if (!morePopular) docfreq = 0; + if (suggestMode!=SuggestMode.SUGGEST_MORE_POPULAR) docfreq = 0; if (thresholdFrequency >= 1f) { docfreq = Math.max(docfreq, (int) thresholdFrequency); diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java b/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java index 2d04163c578..450d11bacac 100755 --- a/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java +++ b/modules/suggest/src/java/org/apache/lucene/search/spell/SpellChecker.java @@ -247,7 +247,7 @@ public class SpellChecker implements java.io.Closeable { * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float) */ public String[] suggestSimilar(String word, int numSug) throws IOException { - return this.suggestSimilar(word, numSug, null, null, false); + return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); } /** @@ -271,7 +271,7 @@ public class SpellChecker implements java.io.Closeable { * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float) */ public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException { - return this.suggestSimilar(word, numSug, null, null, false, accuracy); + return this.suggestSimilar(word, numSug, null, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy); } /** @@ -300,8 +300,16 @@ public class SpellChecker implements java.io.Closeable { * first criteria: the edit distance, second criteria (only if restricted mode): the popularity * of the suggest words in the field of the user index * - * @see #suggestSimilar(String, int, org.apache.lucene.index.IndexReader, String, boolean, float) + * @see #suggestSimilar(String, int, IndexReader, String, SuggestMode, float) + * + * @deprecated + * use suggestSimilar(String, int, IndexReader, String, SuggestMode) + *
As the Lucene similarity that is used to fetch the most relevant n-grammed terms + * is not the same as the edit distance strategy used to calculate the best + * matching spell-checked word from the hits that Lucene found, one usually has + * to retrieve a couple of numSug's in order to get the true best match. + * + *
I.e. if numSug == 1, don't count on that suggestion being the best one.
+ * Thus, you should set this value to at least 5 for a good suggestion.
+ *
+ * @param word the word you want a spell check done on
+ * @param numSug the number of suggested words
+ * @param ir the indexReader of the user index (can be null see field param)
+ * @param field the field of the user index: if field is not null, the suggested
+ * words are restricted to the words present in this field.
+ * @param suggestMode
+ * (NOTE: if indexReader==null and/or field==null, then this is overridden with SuggestMode.SUGGEST_ALWAYS)
+ * @param accuracy The minimum score a suggestion must have in order to qualify for inclusion in the results
+ * @throws IOException if the underlying index throws an {@link IOException}
+ * @throws AlreadyClosedException if the Spellchecker is already closed
+ * @return String[] the sorted list of the suggest words with these 2 criteria:
+ * first criteria: the edit distance, second criteria (only if restricted mode): the popularity
+ * of the suggest words in the field of the user index
+ *
+ */
+ public String[] suggestSimilar(String word, int numSug, IndexReader ir,
+ String field, SuggestMode suggestMode, float accuracy) throws IOException {
// obtainSearcher calls ensureOpen
final IndexSearcher indexSearcher = obtainSearcher();
- try{
+ try {
+ if (ir == null || field == null) {
+ suggestMode = SuggestMode.SUGGEST_ALWAYS;
+ }
+ if (suggestMode == SuggestMode.SUGGEST_ALWAYS) {
+ ir = null;
+ field = null;
+ }
final int lengthWord = word.length();
final int freq = (ir != null && field != null) ? ir.docFreq(new Term(field, word)) : 0;
- final int goalFreq = (morePopular && ir != null && field != null) ? freq : 0;
+ final int goalFreq = suggestMode==SuggestMode.SUGGEST_MORE_POPULAR ? freq : 0;
// if the word exists in the real index and we don't care for word frequency, return the word itself
- if (!morePopular && freq > 0) {
+ if (suggestMode==SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && freq > 0) {
return new String[] { word };
}
@@ -403,7 +470,7 @@ public class SpellChecker implements java.io.Closeable {
if (ir != null && field != null) { // use the user index
sugWord.freq = ir.docFreq(new Term(field, sugWord.string)); // freq in the index
// don't suggest a word that is not present in the field
- if ((morePopular && goalFreq > sugWord.freq) || sugWord.freq < 1) {
+ if ((suggestMode==SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > sugWord.freq) || sugWord.freq < 1) {
continue;
}
}
diff --git a/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestMode.java b/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestMode.java
new file mode 100644
index 00000000000..72b269c0b14
--- /dev/null
+++ b/modules/suggest/src/java/org/apache/lucene/search/spell/SuggestMode.java
@@ -0,0 +1,42 @@
+package org.apache.lucene.search.spell;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Set of strategies for suggesting related terms
+ * @lucene.experimental
+ */
+public enum SuggestMode {
+ /**
+ * Generate suggestions only for terms not in the index (default)
+ */
+ SUGGEST_WHEN_NOT_IN_INDEX,
+
+ /**
+ * Return only suggested words that are as frequent or more frequent than the
+ * searched word
+ */
+ SUGGEST_MORE_POPULAR,
+
+ /**
+ * Always attempt to offer suggestions (however, other parameters may limit
+ * suggestions. For example, see
+ * {@link DirectSpellChecker.setMaxQueryFrequency} ).
+ */
+ SUGGEST_ALWAYS
+}
diff --git a/modules/suggest/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java b/modules/suggest/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java
index 8f8d7ca9e2c..38461acb8c6 100644
--- a/modules/suggest/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java
+++ b/modules/suggest/src/test/org/apache/lucene/search/spell/TestDirectSpellChecker.java
@@ -45,29 +45,35 @@ public class TestDirectSpellChecker extends LuceneTestCase {
IndexReader ir = writer.getReader();
- SuggestWord[] similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
+ SuggestWord[] similar = spellChecker.suggestSimilar(new Term("numbers",
+ "fvie"), 2, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("five", similar[0].string);
- similar = spellChecker.suggestSimilar(new Term("numbers", "five"), 2, ir, false);
+ similar = spellChecker.suggestSimilar(new Term("numbers", "five"), 2, ir,
+ SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
if (similar.length > 0) {
assertFalse(similar[0].string.equals("five")); // don't suggest a word for itself
}
- similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir, false);
+ similar = spellChecker.suggestSimilar(new Term("numbers", "fvie"), 2, ir,
+ SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("five", similar[0].string);
- similar = spellChecker.suggestSimilar(new Term("numbers", "fiv"), 2, ir, false);
+ similar = spellChecker.suggestSimilar(new Term("numbers", "fiv"), 2, ir,
+ SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("five", similar[0].string);
- similar = spellChecker.suggestSimilar(new Term("numbers", "fives"), 2, ir, false);
+ similar = spellChecker.suggestSimilar(new Term("numbers", "fives"), 2, ir,
+ SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("five", similar[0].string);
assertTrue(similar.length > 0);
- similar = spellChecker.suggestSimilar(new Term("numbers", "fie"), 2, ir, false);
+ similar = spellChecker.suggestSimilar(new Term("numbers", "fie"), 2, ir,
+ SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals("five", similar[0].string);
// add some more documents
@@ -81,7 +87,8 @@ public class TestDirectSpellChecker extends LuceneTestCase {
ir = writer.getReader();
// look ma, no spellcheck index rebuild
- similar = spellChecker.suggestSimilar(new Term("numbers", "tousand"), 10, ir, false);
+ similar = spellChecker.suggestSimilar(new Term("numbers", "tousand"), 10,
+ ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertTrue(similar.length > 0);
assertEquals("thousand", similar[0].string);
@@ -109,34 +116,48 @@ public class TestDirectSpellChecker extends LuceneTestCase {
DirectSpellChecker spellChecker = new DirectSpellChecker();
spellChecker.setMaxQueryFrequency(0F);
- SuggestWord[] similar = spellChecker.suggestSimilar(new Term("text", "fobar"), 1, ir, true);
+ SuggestWord[] similar = spellChecker.suggestSimilar(new Term("text",
+ "fobar"), 1, ir, SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMinQueryLength(5);
- similar = spellChecker.suggestSimilar(new Term("text", "foba"), 1, ir, true);
+ similar = spellChecker.suggestSimilar(new Term("text", "foba"), 1, ir,
+ SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMaxEdits(1);
- similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true);
+ similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir,
+ SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setAccuracy(0.9F);
- similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir, true);
+ similar = spellChecker.suggestSimilar(new Term("text", "foobazzz"), 1, ir,
+ SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMinPrefix(0);
- similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true);
+ similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir,
+ SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(1, similar.length);
-
+ similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir,
+ SuggestMode.SUGGEST_MORE_POPULAR);
+
spellChecker = new DirectSpellChecker(); // reset defaults
spellChecker.setMinPrefix(1);
- similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir, true);
+ similar = spellChecker.suggestSimilar(new Term("text", "roobaz"), 1, ir,
+ SuggestMode.SUGGEST_MORE_POPULAR);
assertEquals(0, similar.length);
+ spellChecker = new DirectSpellChecker(); // reset defaults
+ spellChecker.setMaxEdits(2);
+ similar = spellChecker.suggestSimilar(new Term("text", "fobar"), 2, ir,
+ SuggestMode.SUGGEST_ALWAYS);
+ assertEquals(2, similar.length);
+
ir.close();
writer.close();
dir.close();
@@ -156,7 +177,9 @@ public class TestDirectSpellChecker extends LuceneTestCase {
IndexReader ir = writer.getReader();
- SuggestWord[] similar = spellChecker.suggestSimilar(new Term("bogusFieldBogusField", "fvie"), 2, ir, false);
+ SuggestWord[] similar = spellChecker.suggestSimilar(new Term(
+ "bogusFieldBogusField", "fvie"), 2, ir,
+ SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
assertEquals(0, similar.length);
ir.close();
writer.close();
diff --git a/modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java b/modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
index 0b685442739..f74468a3b5e 100755
--- a/modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
+++ b/modules/suggest/src/test/org/apache/lucene/search/spell/TestSpellChecker.java
@@ -63,6 +63,29 @@ public class TestSpellChecker extends LuceneTestCase {
doc.add(newField("field3", "fvei" + (i % 2 == 0 ? " five" : ""), TextField.TYPE_STORED)); // + word thousand
writer.addDocument(doc);
}
+ {
+ Document doc = new Document();
+ doc.add(newField("field1", "eight", TextField.TYPE_STORED)); // "eight" in
+ // the index
+ // twice
+ writer.addDocument(doc);
+ }
+ {
+ Document doc = new Document();
+ doc
+ .add(newField("field1", "twenty-one twenty-one",
+ TextField.TYPE_STORED)); // "twenty-one" in the index thrice
+ writer.addDocument(doc);
+ }
+ {
+ Document doc = new Document();
+ doc.add(newField("field1", "twenty", TextField.TYPE_STORED)); // "twenty"
+ // in the
+ // index
+ // twice
+ writer.addDocument(doc);
+ }
+
writer.close();
searchers = Collections.synchronizedList(new ArrayList