mirror of https://github.com/apache/lucene.git
LUCENE-2479: Add support for alternate comparators for spelling
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@986477 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
db4b2af8af
commit
f39ad7f02e
|
@ -20,6 +20,9 @@ New Features
|
|||
code is refactored to support append-only FS, and to allow for future
|
||||
customization of per-segment information. (Andrzej Bialecki)
|
||||
|
||||
* LUCENE-2479: Added ability to provide a sort comparator for spelling suggestions along
|
||||
with two implementations. The existing comparator (score, then frequency) is the default (Grant Ingersoll)
|
||||
|
||||
======================= Lucene 3.x (not yet released) =======================
|
||||
|
||||
Changes in backwards compatibility policy
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.search.spell;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
|
||||
|
@ -103,6 +104,8 @@ public class SpellChecker implements java.io.Closeable {
|
|||
|
||||
private StringDistance sd;
|
||||
|
||||
private Comparator<SuggestWord> comparator;
|
||||
|
||||
/**
|
||||
* Use the given directory as a spell checker index. The directory
|
||||
* is created if it doesn't exist yet.
|
||||
|
@ -111,8 +114,7 @@ public class SpellChecker implements java.io.Closeable {
|
|||
* @throws IOException if Spellchecker can not open the directory
|
||||
*/
|
||||
public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException {
|
||||
setSpellIndex(spellIndex);
|
||||
setStringDistance(sd);
|
||||
this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR);
|
||||
}
|
||||
/**
|
||||
* Use the given directory as a spell checker index with a
|
||||
|
@ -128,6 +130,20 @@ public class SpellChecker implements java.io.Closeable {
|
|||
this(spellIndex, new LevensteinDistance());
|
||||
}
|
||||
|
||||
/**
|
||||
* Use the given directory as a spell checker index with the given {@link org.apache.lucene.search.spell.StringDistance} measure
|
||||
* and the given {@link java.util.Comparator} for sorting the results.
|
||||
* @param spellIndex The spelling index
|
||||
* @param sd The distance
|
||||
* @param comparator The comparator
|
||||
* @throws IOException if there is a problem opening the index
|
||||
*/
|
||||
public SpellChecker(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
|
||||
setSpellIndex(spellIndex);
|
||||
setStringDistance(sd);
|
||||
this.comparator = comparator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Use a different index as the spell checker index or re-open
|
||||
* the existing index if <code>spellIndex</code> is the same value
|
||||
|
@ -151,6 +167,15 @@ public class SpellChecker implements java.io.Closeable {
|
|||
swapSearcher(spellIndexDir);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the {@link java.util.Comparator} for the {@link SuggestWordQueue}.
|
||||
* @param comparator the comparator
|
||||
*/
|
||||
public void setComparator(Comparator<SuggestWord> comparator) {
|
||||
this.comparator = comparator;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the {@link StringDistance} implementation for this
|
||||
* {@link SpellChecker} instance.
|
||||
|
@ -271,7 +296,7 @@ public class SpellChecker implements java.io.Closeable {
|
|||
// System.out.println("Q: " + query);
|
||||
ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs;
|
||||
// System.out.println("HITS: " + hits.length());
|
||||
SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
|
||||
SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);
|
||||
|
||||
// go thru more than 'maxr' matches in case the distance filter triggers
|
||||
int stop = Math.min(hits.length, maxHits);
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
package org.apache.lucene.search.spell;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -20,10 +22,13 @@ package org.apache.lucene.search.spell;
|
|||
|
||||
/**
|
||||
* SuggestWord, used in suggestSimilar method in SpellChecker class.
|
||||
* <p/>
|
||||
* Default sort is first by score, then by frequency.
|
||||
*
|
||||
*
|
||||
*/
|
||||
final class SuggestWord {
|
||||
public final class SuggestWord{
|
||||
|
||||
/**
|
||||
* the score of the word
|
||||
*/
|
||||
|
@ -39,23 +44,4 @@ final class SuggestWord {
|
|||
*/
|
||||
public String string;
|
||||
|
||||
public final int compareTo(SuggestWord a) {
|
||||
// first criteria: the edit distance
|
||||
if (score > a.score) {
|
||||
return 1;
|
||||
}
|
||||
if (score < a.score) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// second criteria (if first criteria is equal): the popularity
|
||||
if (freq > a.freq) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (freq < a.freq) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
package org.apache.lucene.search.spell;
|
||||
|
||||
import java.util.Comparator;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Frequency first, then score. Must have
|
||||
*
|
||||
**/
|
||||
public class SuggestWordFrequencyComparator implements Comparator<SuggestWord> {
|
||||
|
||||
@Override
|
||||
public int compare(SuggestWord first, SuggestWord second) {
|
||||
// first criteria: the frequency
|
||||
if (first.freq > second.freq) {
|
||||
return 1;
|
||||
}
|
||||
if (first.freq < second.freq) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// second criteria (if first criteria is equal): the score
|
||||
if (first.score > second.score) {
|
||||
return 1;
|
||||
}
|
||||
if (first.score < second.score) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -20,20 +20,44 @@ package org.apache.lucene.search.spell;
|
|||
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
|
||||
/**
|
||||
* Sorts SuggestWord instances
|
||||
*
|
||||
* @see org.apache.lucene.search.spell.SuggestWordScoreComparator
|
||||
* @see org.apache.lucene.search.spell.SuggestWordFrequencyComparator
|
||||
*
|
||||
*/
|
||||
final class SuggestWordQueue extends PriorityQueue<SuggestWord> {
|
||||
public final class SuggestWordQueue extends PriorityQueue<SuggestWord> {
|
||||
public static final Comparator<SuggestWord> DEFAULT_COMPARATOR = new SuggestWordScoreComparator();
|
||||
|
||||
SuggestWordQueue (int size) {
|
||||
|
||||
private Comparator<SuggestWord> comparator;
|
||||
|
||||
/**
|
||||
* Use the {@link #DEFAULT_COMPARATOR}
|
||||
* @param size The size of the queue
|
||||
*/
|
||||
public SuggestWordQueue (int size) {
|
||||
initialize(size);
|
||||
comparator = DEFAULT_COMPARATOR;
|
||||
}
|
||||
|
||||
/**
|
||||
* Specify the size of the queue and the comparator to use for sorting.
|
||||
* @param size The size
|
||||
* @param comparator The comparator.
|
||||
*/
|
||||
public SuggestWordQueue(int size, Comparator<SuggestWord> comparator){
|
||||
initialize(size);
|
||||
this.comparator = comparator;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected final boolean lessThan (SuggestWord wa, SuggestWord wb) {
|
||||
int val = wa.compareTo(wb);
|
||||
int val = comparator.compare(wa, wb);
|
||||
return val < 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
package org.apache.lucene.search.spell;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
|
||||
/**
|
||||
* Score first, then frequency
|
||||
*
|
||||
**/
|
||||
class SuggestWordScoreComparator implements Comparator<SuggestWord> {
|
||||
@Override
|
||||
public int compare(SuggestWord first, SuggestWord second) {
|
||||
// first criteria: the distance
|
||||
if (first.score > second.score) {
|
||||
return 1;
|
||||
}
|
||||
if (first.score < second.score) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// second criteria (if first criteria is equal): the popularity
|
||||
if (first.freq > second.freq) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (first.freq < second.freq) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.search.spell;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
|
@ -61,6 +62,7 @@ public class TestSpellChecker extends LuceneTestCase {
|
|||
Document doc = new Document();
|
||||
doc.add(new Field("field1", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(new Field("field2", English.intToEnglish(i + 1), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand
|
||||
doc.add(new Field("field3", "fvei" + (i % 2 == 0 ? " five" : ""), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
writer.close();
|
||||
|
@ -85,10 +87,10 @@ public class TestSpellChecker extends LuceneTestCase {
|
|||
|
||||
spellChecker.clearIndex();
|
||||
|
||||
addwords(r, "field1");
|
||||
addwords(r, spellChecker, "field1");
|
||||
int num_field1 = this.numdoc();
|
||||
|
||||
addwords(r, "field2");
|
||||
addwords(r, spellChecker, "field2");
|
||||
int num_field2 = this.numdoc();
|
||||
|
||||
assertEquals(num_field2, num_field1 + 1);
|
||||
|
@ -110,6 +112,25 @@ public class TestSpellChecker extends LuceneTestCase {
|
|||
r.close();
|
||||
}
|
||||
|
||||
public void testComparator() throws Exception {
|
||||
IndexReader r = IndexReader.open(userindex, true);
|
||||
Directory compIdx = newDirectory(random);
|
||||
SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
|
||||
addwords(r, compareSP, "field3");
|
||||
|
||||
String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3", false);
|
||||
assertTrue(similar.length == 2);
|
||||
//five and fvei have the same score, but different frequencies.
|
||||
assertEquals("fvei", similar[0]);
|
||||
assertEquals("five", similar[1]);
|
||||
r.close();
|
||||
if (!compareSP.isClosed())
|
||||
compareSP.close();
|
||||
compIdx.close();
|
||||
|
||||
|
||||
}
|
||||
|
||||
private void checkCommonSuggestions(IndexReader r) throws IOException {
|
||||
String[] similar = spellChecker.suggestSimilar("fvie", 2);
|
||||
assertTrue(similar.length > 0);
|
||||
|
@ -204,9 +225,9 @@ public class TestSpellChecker extends LuceneTestCase {
|
|||
assertEquals(similar[1], "ninety");
|
||||
}
|
||||
|
||||
private void addwords(IndexReader r, String field) throws IOException {
|
||||
private void addwords(IndexReader r, SpellChecker sc, String field) throws IOException {
|
||||
long time = System.currentTimeMillis();
|
||||
spellChecker.indexDictionary(new LuceneDictionary(r, field));
|
||||
sc.indexDictionary(new LuceneDictionary(r, field));
|
||||
time = System.currentTimeMillis() - time;
|
||||
//System.out.println("time to build " + field + ": " + time);
|
||||
}
|
||||
|
@ -224,9 +245,9 @@ public class TestSpellChecker extends LuceneTestCase {
|
|||
IndexReader r = IndexReader.open(userindex, true);
|
||||
spellChecker.clearIndex();
|
||||
String field = "field1";
|
||||
addwords(r, "field1");
|
||||
addwords(r, spellChecker, "field1");
|
||||
int num_field1 = this.numdoc();
|
||||
addwords(r, "field2");
|
||||
addwords(r, spellChecker, "field2");
|
||||
int num_field2 = this.numdoc();
|
||||
assertEquals(num_field2, num_field1 + 1);
|
||||
checkCommonSuggestions(r);
|
||||
|
@ -280,10 +301,10 @@ public class TestSpellChecker extends LuceneTestCase {
|
|||
final IndexReader r = IndexReader.open(userindex, true);
|
||||
spellChecker.clearIndex();
|
||||
assertEquals(2, searchers.size());
|
||||
addwords(r, "field1");
|
||||
addwords(r, spellChecker, "field1");
|
||||
assertEquals(3, searchers.size());
|
||||
int num_field1 = this.numdoc();
|
||||
addwords(r, "field2");
|
||||
addwords(r, spellChecker, "field2");
|
||||
assertEquals(4, searchers.size());
|
||||
int num_field2 = this.numdoc();
|
||||
assertEquals(num_field2, num_field1 + 1);
|
||||
|
@ -396,6 +417,10 @@ public class TestSpellChecker extends LuceneTestCase {
|
|||
super(spellIndex, sd);
|
||||
}
|
||||
|
||||
public SpellCheckerMock(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
|
||||
super(spellIndex, sd, comparator);
|
||||
}
|
||||
|
||||
@Override
|
||||
IndexSearcher createSearcher(Directory dir) throws IOException {
|
||||
IndexSearcher searcher = super.createSearcher(dir);
|
||||
|
|
|
@ -24,6 +24,8 @@ import java.util.concurrent.ConcurrentHashMap;
|
|||
|
||||
import org.apache.lucene.search.spell.LevensteinDistance;
|
||||
import org.apache.lucene.search.spell.StringDistance;
|
||||
import org.apache.lucene.search.spell.SuggestWord;
|
||||
import org.apache.lucene.search.spell.SuggestWordQueue;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
import org.apache.solr.client.solrj.response.SpellCheckResponse;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -157,59 +159,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
|
|||
}
|
||||
}
|
||||
|
||||
static class SuggestWordQueue extends PriorityQueue {
|
||||
SuggestWordQueue(int size) {
|
||||
initialize(size);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean lessThan(Object a, Object b) {
|
||||
SuggestWord wa = (SuggestWord) a;
|
||||
SuggestWord wb = (SuggestWord) b;
|
||||
int val = wa.compareTo(wb);
|
||||
return val < 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Borrowed from Lucene SpellChecker
|
||||
*/
|
||||
static class SuggestWord {
|
||||
/**
|
||||
* the score of the word
|
||||
*/
|
||||
public float score;
|
||||
|
||||
/**
|
||||
* The freq of the word
|
||||
*/
|
||||
public int freq;
|
||||
|
||||
/**
|
||||
* the suggested word
|
||||
*/
|
||||
public String string;
|
||||
|
||||
public final int compareTo(SuggestWord a) {
|
||||
// first criteria: the edit distance
|
||||
if (score > a.score) {
|
||||
return 1;
|
||||
}
|
||||
if (score < a.score) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// second criteria (if first criteria is equal): the popularity
|
||||
if (freq > a.freq) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (freq < a.freq) {
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) {
|
||||
|
|
Loading…
Reference in New Issue