diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index c857d3bea31..3275b9f2f0a 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -20,6 +20,9 @@ New Features code is refactored to support append-only FS, and to allow for future customization of per-segment information. (Andrzej Bialecki) + * LUCENE-2479: Added ability to provide a sort comparator for spelling suggestions along + with two implementations. The existing comparator (score, then frequency) is the default (Grant Ingersoll) + ======================= Lucene 3.x (not yet released) ======================= Changes in backwards compatibility policy diff --git a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java index c35acd073dc..973592b7bde 100755 --- a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java +++ b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SpellChecker.java @@ -18,6 +18,7 @@ package org.apache.lucene.search.spell; */ import java.io.IOException; +import java.util.Comparator; import java.util.Iterator; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; @@ -103,6 +104,8 @@ public class SpellChecker implements java.io.Closeable { private StringDistance sd; + private Comparator comparator; + /** * Use the given directory as a spell checker index. The directory * is created if it doesn't exist yet. @@ -111,8 +114,7 @@ public class SpellChecker implements java.io.Closeable { * @throws IOException if Spellchecker can not open the directory */ public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException { - setSpellIndex(spellIndex); - setStringDistance(sd); + this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR); } /** * Use the given directory as a spell checker index with a @@ -127,6 +129,20 @@ public class SpellChecker implements java.io.Closeable { public SpellChecker(Directory spellIndex) throws IOException { this(spellIndex, new LevensteinDistance()); } + + /** + * Use the given directory as a spell checker index with the given {@link org.apache.lucene.search.spell.StringDistance} measure + * and the given {@link java.util.Comparator} for sorting the results. + * @param spellIndex The spelling index + * @param sd The distance + * @param comparator The comparator + * @throws IOException if there is a problem opening the index + */ + public SpellChecker(Directory spellIndex, StringDistance sd, Comparator comparator) throws IOException { + setSpellIndex(spellIndex); + setStringDistance(sd); + this.comparator = comparator; + } /** * Use a different index as the spell checker index or re-open @@ -151,6 +167,15 @@ public class SpellChecker implements java.io.Closeable { swapSearcher(spellIndexDir); } } + + /** + * Sets the {@link java.util.Comparator} for the {@link SuggestWordQueue}. + * @param comparator the comparator + */ + public void setComparator(Comparator comparator) { + this.comparator = comparator; + } + /** * Sets the {@link StringDistance} implementation for this * {@link SpellChecker} instance. @@ -271,7 +296,7 @@ public class SpellChecker implements java.io.Closeable { // System.out.println("Q: " + query); ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs; // System.out.println("HITS: " + hits.length()); - SuggestWordQueue sugQueue = new SuggestWordQueue(numSug); + SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator); // go thru more than 'maxr' matches in case the distance filter triggers int stop = Math.min(hits.length, maxHits); diff --git a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java index acb980a1322..6a88818110a 100755 --- a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java +++ b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWord.java @@ -1,5 +1,7 @@ package org.apache.lucene.search.spell; +import java.util.Comparator; + /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -20,10 +22,13 @@ package org.apache.lucene.search.spell; /** * SuggestWord, used in suggestSimilar method in SpellChecker class. + *

+ * Default sort is first by score, then by frequency. * * */ -final class SuggestWord { +public final class SuggestWord{ + /** * the score of the word */ @@ -39,23 +44,4 @@ final class SuggestWord { */ public String string; - public final int compareTo(SuggestWord a) { - // first criteria: the edit distance - if (score > a.score) { - return 1; - } - if (score < a.score) { - return -1; - } - - // second criteria (if first criteria is equal): the popularity - if (freq > a.freq) { - return 1; - } - - if (freq < a.freq) { - return -1; - } - return 0; - } -} +} \ No newline at end of file diff --git a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java new file mode 100644 index 00000000000..ae9b99498c1 --- /dev/null +++ b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordFrequencyComparator.java @@ -0,0 +1,47 @@ +package org.apache.lucene.search.spell; + +import java.util.Comparator; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/** + * Frequency first, then score. Must have + * + **/ +public class SuggestWordFrequencyComparator implements Comparator { + + @Override + public int compare(SuggestWord first, SuggestWord second) { + // first criteria: the frequency + if (first.freq > second.freq) { + return 1; + } + if (first.freq < second.freq) { + return -1; + } + + // second criteria (if first criteria is equal): the score + if (first.score > second.score) { + return 1; + } + if (first.score < second.score) { + return -1; + } + return 0; + } +} diff --git a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java index c1c9dfd259f..af3f9d0e055 100755 --- a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java +++ b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordQueue.java @@ -20,20 +20,44 @@ package org.apache.lucene.search.spell; import org.apache.lucene.util.PriorityQueue; +import java.util.Comparator; + /** * Sorts SuggestWord instances * + * @see org.apache.lucene.search.spell.SuggestWordScoreComparator + * @see org.apache.lucene.search.spell.SuggestWordFrequencyComparator + * */ -final class SuggestWordQueue extends PriorityQueue { +public final class SuggestWordQueue extends PriorityQueue { + public static final Comparator DEFAULT_COMPARATOR = new SuggestWordScoreComparator(); - SuggestWordQueue (int size) { + + private Comparator comparator; + + /** + * Use the {@link #DEFAULT_COMPARATOR} + * @param size The size of the queue + */ + public SuggestWordQueue (int size) { initialize(size); + comparator = DEFAULT_COMPARATOR; + } + + /** + * Specify the size of the queue and the comparator to use for sorting. + * @param size The size + * @param comparator The comparator. + */ + public SuggestWordQueue(int size, Comparator comparator){ + initialize(size); + this.comparator = comparator; } @Override protected final boolean lessThan (SuggestWord wa, SuggestWord wb) { - int val = wa.compareTo(wb); + int val = comparator.compare(wa, wb); return val < 0; } } diff --git a/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java new file mode 100644 index 00000000000..ad46a2d3eb3 --- /dev/null +++ b/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/spell/SuggestWordScoreComparator.java @@ -0,0 +1,47 @@ +package org.apache.lucene.search.spell; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Comparator; + + +/** + * Score first, then frequency + * + **/ +class SuggestWordScoreComparator implements Comparator { + @Override + public int compare(SuggestWord first, SuggestWord second) { + // first criteria: the distance + if (first.score > second.score) { + return 1; + } + if (first.score < second.score) { + return -1; + } + + // second criteria (if first criteria is equal): the popularity + if (first.freq > second.freq) { + return 1; + } + + if (first.freq < second.freq) { + return -1; + } + return 0; + } +} diff --git a/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java b/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java index 53268244ea6..0f0a053c3a5 100755 --- a/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java +++ b/lucene/contrib/spellchecker/src/test/org/apache/lucene/search/spell/TestSpellChecker.java @@ -20,6 +20,7 @@ package org.apache.lucene.search.spell; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; +import java.util.Comparator; import java.util.List; import java.util.Random; import java.util.concurrent.ExecutorService; @@ -61,6 +62,7 @@ public class TestSpellChecker extends LuceneTestCase { Document doc = new Document(); doc.add(new Field("field1", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("field2", English.intToEnglish(i + 1), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand + doc.add(new Field("field3", "fvei" + (i % 2 == 0 ? " five" : ""), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand writer.addDocument(doc); } writer.close(); @@ -85,10 +87,10 @@ public class TestSpellChecker extends LuceneTestCase { spellChecker.clearIndex(); - addwords(r, "field1"); + addwords(r, spellChecker, "field1"); int num_field1 = this.numdoc(); - addwords(r, "field2"); + addwords(r, spellChecker, "field2"); int num_field2 = this.numdoc(); assertEquals(num_field2, num_field1 + 1); @@ -110,6 +112,25 @@ public class TestSpellChecker extends LuceneTestCase { r.close(); } + public void testComparator() throws Exception { + IndexReader r = IndexReader.open(userindex, true); + Directory compIdx = newDirectory(random); + SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator()); + addwords(r, compareSP, "field3"); + + String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3", false); + assertTrue(similar.length == 2); + //five and fvei have the same score, but different frequencies. + assertEquals("fvei", similar[0]); + assertEquals("five", similar[1]); + r.close(); + if (!compareSP.isClosed()) + compareSP.close(); + compIdx.close(); + + + } + private void checkCommonSuggestions(IndexReader r) throws IOException { String[] similar = spellChecker.suggestSimilar("fvie", 2); assertTrue(similar.length > 0); @@ -204,9 +225,9 @@ public class TestSpellChecker extends LuceneTestCase { assertEquals(similar[1], "ninety"); } - private void addwords(IndexReader r, String field) throws IOException { + private void addwords(IndexReader r, SpellChecker sc, String field) throws IOException { long time = System.currentTimeMillis(); - spellChecker.indexDictionary(new LuceneDictionary(r, field)); + sc.indexDictionary(new LuceneDictionary(r, field)); time = System.currentTimeMillis() - time; //System.out.println("time to build " + field + ": " + time); } @@ -224,9 +245,9 @@ public class TestSpellChecker extends LuceneTestCase { IndexReader r = IndexReader.open(userindex, true); spellChecker.clearIndex(); String field = "field1"; - addwords(r, "field1"); + addwords(r, spellChecker, "field1"); int num_field1 = this.numdoc(); - addwords(r, "field2"); + addwords(r, spellChecker, "field2"); int num_field2 = this.numdoc(); assertEquals(num_field2, num_field1 + 1); checkCommonSuggestions(r); @@ -280,10 +301,10 @@ public class TestSpellChecker extends LuceneTestCase { final IndexReader r = IndexReader.open(userindex, true); spellChecker.clearIndex(); assertEquals(2, searchers.size()); - addwords(r, "field1"); + addwords(r, spellChecker, "field1"); assertEquals(3, searchers.size()); int num_field1 = this.numdoc(); - addwords(r, "field2"); + addwords(r, spellChecker, "field2"); assertEquals(4, searchers.size()); int num_field2 = this.numdoc(); assertEquals(num_field2, num_field1 + 1); @@ -396,6 +417,10 @@ public class TestSpellChecker extends LuceneTestCase { super(spellIndex, sd); } + public SpellCheckerMock(Directory spellIndex, StringDistance sd, Comparator comparator) throws IOException { + super(spellIndex, sd, comparator); + } + @Override IndexSearcher createSearcher(Directory dir) throws IOException { IndexSearcher searcher = super.createSearcher(dir); diff --git a/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java b/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java index f5fdd8005ee..be5a7731486 100644 --- a/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java +++ b/solr/src/java/org/apache/solr/handler/component/SpellCheckComponent.java @@ -24,6 +24,8 @@ import java.util.concurrent.ConcurrentHashMap; import org.apache.lucene.search.spell.LevensteinDistance; import org.apache.lucene.search.spell.StringDistance; +import org.apache.lucene.search.spell.SuggestWord; +import org.apache.lucene.search.spell.SuggestWordQueue; import org.apache.lucene.util.PriorityQueue; import org.apache.solr.client.solrj.response.SpellCheckResponse; import org.slf4j.Logger; @@ -157,59 +159,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar } } - static class SuggestWordQueue extends PriorityQueue { - SuggestWordQueue(int size) { - initialize(size); - } - @Override - protected boolean lessThan(Object a, Object b) { - SuggestWord wa = (SuggestWord) a; - SuggestWord wb = (SuggestWord) b; - int val = wa.compareTo(wb); - return val < 0; - } - } - - /** - * Borrowed from Lucene SpellChecker - */ - static class SuggestWord { - /** - * the score of the word - */ - public float score; - - /** - * The freq of the word - */ - public int freq; - - /** - * the suggested word - */ - public String string; - - public final int compareTo(SuggestWord a) { - // first criteria: the edit distance - if (score > a.score) { - return 1; - } - if (score < a.score) { - return -1; - } - - // second criteria (if first criteria is equal): the popularity - if (freq > a.freq) { - return 1; - } - - if (freq < a.freq) { - return -1; - } - return 0; - } - } @Override public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) {