LUCENE-2479: Add support for alternate comparators for spelling

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@986477 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2010-08-17 20:40:58 +00:00
parent db4b2af8af
commit f39ad7f02e
8 changed files with 194 additions and 87 deletions

View File

@ -20,6 +20,9 @@ New Features
code is refactored to support append-only FS, and to allow for future code is refactored to support append-only FS, and to allow for future
customization of per-segment information. (Andrzej Bialecki) customization of per-segment information. (Andrzej Bialecki)
* LUCENE-2479: Added the ability to provide a sort comparator for spelling suggestions, along
with two implementations. The existing comparator (score, then frequency) remains the default. (Grant Ingersoll)
======================= Lucene 3.x (not yet released) ======================= ======================= Lucene 3.x (not yet released) =======================
Changes in backwards compatibility policy Changes in backwards compatibility policy

View File

@ -18,6 +18,7 @@ package org.apache.lucene.search.spell;
*/ */
import java.io.IOException; import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator; import java.util.Iterator;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
@ -103,6 +104,8 @@ public class SpellChecker implements java.io.Closeable {
private StringDistance sd; private StringDistance sd;
private Comparator<SuggestWord> comparator;
/** /**
* Use the given directory as a spell checker index. The directory * Use the given directory as a spell checker index. The directory
* is created if it doesn't exist yet. * is created if it doesn't exist yet.
@ -111,8 +114,7 @@ public class SpellChecker implements java.io.Closeable {
* @throws IOException if Spellchecker can not open the directory * @throws IOException if Spellchecker can not open the directory
*/ */
public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException { public SpellChecker(Directory spellIndex, StringDistance sd) throws IOException {
setSpellIndex(spellIndex); this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR);
setStringDistance(sd);
} }
/** /**
* Use the given directory as a spell checker index with a * Use the given directory as a spell checker index with a
@ -128,6 +130,20 @@ public class SpellChecker implements java.io.Closeable {
this(spellIndex, new LevensteinDistance()); this(spellIndex, new LevensteinDistance());
} }
/**
* Use the given directory as a spell checker index with the given {@link org.apache.lucene.search.spell.StringDistance} measure
* and the given {@link java.util.Comparator} for sorting the results.
* <p>
* The comparator is stored on this instance and handed to the
* {@link SuggestWordQueue} that collects suggestions. The two-argument
* constructor delegates here with {@link SuggestWordQueue#DEFAULT_COMPARATOR}.
* @param spellIndex The spelling index
* @param sd The distance measure used to score candidate words
* @param comparator The comparator used to order suggestions
* @throws IOException if there is a problem opening the index
*/
public SpellChecker(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
setSpellIndex(spellIndex);
setStringDistance(sd);
// NOTE(review): comparator is not null-checked; the queue calls
// comparator.compare(...) directly, so a null here would presumably only
// fail later when suggestions are queued -- confirm whether that is intended.
this.comparator = comparator;
}
/** /**
* Use a different index as the spell checker index or re-open * Use a different index as the spell checker index or re-open
* the existing index if <code>spellIndex</code> is the same value * the existing index if <code>spellIndex</code> is the same value
@ -151,6 +167,15 @@ public class SpellChecker implements java.io.Closeable {
swapSearcher(spellIndexDir); swapSearcher(spellIndexDir);
} }
} }
/**
* Sets the {@link java.util.Comparator} for the {@link SuggestWordQueue}.
* <p>
* The stored comparator is passed to each {@link SuggestWordQueue} this
* spell checker creates when collecting suggestions.
* @param comparator the comparator used to order suggestions
*/
public void setComparator(Comparator<SuggestWord> comparator) {
// NOTE(review): no null check; a null comparator would presumably surface
// as a failure when the queue compares entries -- confirm.
this.comparator = comparator;
}
/** /**
* Sets the {@link StringDistance} implementation for this * Sets the {@link StringDistance} implementation for this
* {@link SpellChecker} instance. * {@link SpellChecker} instance.
@ -271,7 +296,7 @@ public class SpellChecker implements java.io.Closeable {
// System.out.println("Q: " + query); // System.out.println("Q: " + query);
ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs; ScoreDoc[] hits = indexSearcher.search(query, null, maxHits).scoreDocs;
// System.out.println("HITS: " + hits.length()); // System.out.println("HITS: " + hits.length());
SuggestWordQueue sugQueue = new SuggestWordQueue(numSug); SuggestWordQueue sugQueue = new SuggestWordQueue(numSug, comparator);
// go thru more than 'maxr' matches in case the distance filter triggers // go thru more than 'maxr' matches in case the distance filter triggers
int stop = Math.min(hits.length, maxHits); int stop = Math.min(hits.length, maxHits);

View File

@ -1,5 +1,7 @@
package org.apache.lucene.search.spell; package org.apache.lucene.search.spell;
import java.util.Comparator;
/** /**
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
@ -20,10 +22,13 @@ package org.apache.lucene.search.spell;
/** /**
* SuggestWord, used in suggestSimilar method in SpellChecker class. * SuggestWord, used in suggestSimilar method in SpellChecker class.
* <p/>
* Default sort is first by score, then by frequency.
* *
* *
*/ */
final class SuggestWord { public final class SuggestWord{
/** /**
* the score of the word * the score of the word
*/ */
@ -39,23 +44,4 @@ final class SuggestWord {
*/ */
public String string; public String string;
/**
 * Compares this word with another one: by score first and, when the scores
 * tie, by frequency.
 *
 * @param a the word to compare against
 * @return 1, -1 or 0 when this word sorts after, before or equal to <code>a</code>
 */
public final int compareTo(SuggestWord a) {
  // first criteria: the edit distance
  int byScore = score < a.score ? -1 : (score > a.score ? 1 : 0);
  if (byScore != 0) {
    return byScore;
  }
  // second criteria (if first criteria is equal): the popularity
  return freq < a.freq ? -1 : (freq > a.freq ? 1 : 0);
}
} }

View File

@ -0,0 +1,47 @@
package org.apache.lucene.search.spell;
import java.util.Comparator;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Orders {@link SuggestWord} instances by frequency first and, when the
 * frequencies are equal, by score.
 **/
public class SuggestWordFrequencyComparator implements Comparator<SuggestWord> {

  /**
   * @param first the left-hand word
   * @param second the right-hand word
   * @return -1, 0 or 1 when <code>first</code> sorts before, equal to or
   *         after <code>second</code>
   */
  @Override
  public int compare(SuggestWord first, SuggestWord second) {
    // first criteria: the frequency
    int byFrequency = first.freq < second.freq ? -1 : (first.freq > second.freq ? 1 : 0);
    if (byFrequency != 0) {
      return byFrequency;
    }
    // second criteria (if first criteria is equal): the score.
    // Plain < and > are kept so values that compare neither way yield 0.
    return first.score < second.score ? -1 : (first.score > second.score ? 1 : 0);
  }
}

View File

@ -20,20 +20,44 @@ package org.apache.lucene.search.spell;
import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.PriorityQueue;
import java.util.Comparator;
/** /**
* Sorts SuggestWord instances * Sorts SuggestWord instances
* *
* @see org.apache.lucene.search.spell.SuggestWordScoreComparator
* @see org.apache.lucene.search.spell.SuggestWordFrequencyComparator
*
*/ */
final class SuggestWordQueue extends PriorityQueue<SuggestWord> { public final class SuggestWordQueue extends PriorityQueue<SuggestWord> {
public static final Comparator<SuggestWord> DEFAULT_COMPARATOR = new SuggestWordScoreComparator();
SuggestWordQueue (int size) {
private Comparator<SuggestWord> comparator;
/**
* Use the {@link #DEFAULT_COMPARATOR}
* @param size The size of the queue
*/
public SuggestWordQueue (int size) {
initialize(size); initialize(size);
comparator = DEFAULT_COMPARATOR;
}
/**
* Specify the size of the queue and the comparator to use for sorting.
* @param size The size
* @param comparator The comparator.
*/
public SuggestWordQueue(int size, Comparator<SuggestWord> comparator){
initialize(size);
this.comparator = comparator;
} }
@Override @Override
protected final boolean lessThan (SuggestWord wa, SuggestWord wb) { protected final boolean lessThan (SuggestWord wa, SuggestWord wb) {
int val = wa.compareTo(wb); int val = comparator.compare(wa, wb);
return val < 0; return val < 0;
} }
} }

View File

@ -0,0 +1,47 @@
package org.apache.lucene.search.spell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Comparator;
/**
 * Orders {@link SuggestWord} instances by score first and, when the scores
 * are equal, by frequency.
 **/
class SuggestWordScoreComparator implements Comparator<SuggestWord> {

  /**
   * @param first the left-hand word
   * @param second the right-hand word
   * @return -1, 0 or 1 when <code>first</code> sorts before, equal to or
   *         after <code>second</code>
   */
  @Override
  public int compare(SuggestWord first, SuggestWord second) {
    // first criteria: the distance.
    // Plain < and > are kept so values that compare neither way fall through.
    int byScore = first.score < second.score ? -1 : (first.score > second.score ? 1 : 0);
    if (byScore != 0) {
      return byScore;
    }
    // second criteria (if first criteria is equal): the popularity
    return first.freq < second.freq ? -1 : (first.freq > second.freq ? 1 : 0);
  }
}

View File

@ -20,6 +20,7 @@ package org.apache.lucene.search.spell;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.Comparator;
import java.util.List; import java.util.List;
import java.util.Random; import java.util.Random;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
@ -61,6 +62,7 @@ public class TestSpellChecker extends LuceneTestCase {
Document doc = new Document(); Document doc = new Document();
doc.add(new Field("field1", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("field1", English.intToEnglish(i), Field.Store.YES, Field.Index.ANALYZED));
doc.add(new Field("field2", English.intToEnglish(i + 1), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand doc.add(new Field("field2", English.intToEnglish(i + 1), Field.Store.YES, Field.Index.ANALYZED)); // + word thousand
doc.add(new Field("field3", "fvei" + (i % 2 == 0 ? " five" : ""), Field.Store.YES, Field.Index.ANALYZED)); // "fvei" in every doc, "five" only when i is even
writer.addDocument(doc); writer.addDocument(doc);
} }
writer.close(); writer.close();
@ -85,10 +87,10 @@ public class TestSpellChecker extends LuceneTestCase {
spellChecker.clearIndex(); spellChecker.clearIndex();
addwords(r, "field1"); addwords(r, spellChecker, "field1");
int num_field1 = this.numdoc(); int num_field1 = this.numdoc();
addwords(r, "field2"); addwords(r, spellChecker, "field2");
int num_field2 = this.numdoc(); int num_field2 = this.numdoc();
assertEquals(num_field2, num_field1 + 1); assertEquals(num_field2, num_field1 + 1);
@ -110,6 +112,25 @@ public class TestSpellChecker extends LuceneTestCase {
r.close(); r.close();
} }
/**
 * Checks that a comparator supplied to the spell checker decides the order
 * of the returned suggestions: with {@link SuggestWordFrequencyComparator},
 * "fvei" is expected ahead of "five" for the input "fvie".
 * <p>
 * All resources are released in a <code>finally</code> block so that a
 * failing assertion does not leave the reader, spell checker or directory
 * open.
 */
public void testComparator() throws Exception {
  IndexReader r = IndexReader.open(userindex, true);
  Directory compIdx = null;
  SpellChecker compareSP = null;
  try {
    compIdx = newDirectory(random);
    compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator());
    addwords(r, compareSP, "field3");

    String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3", false);
    // assertEquals reports the actual length on failure, unlike assertTrue(a == b)
    assertEquals(2, similar.length);
    //five and fvei have the same score, but different frequencies.
    assertEquals("fvei", similar[0]);
    assertEquals("five", similar[1]);
  } finally {
    // same close order as before: reader, spell checker, then its directory
    r.close();
    if (compareSP != null && !compareSP.isClosed()) {
      compareSP.close();
    }
    if (compIdx != null) {
      compIdx.close();
    }
  }
}
private void checkCommonSuggestions(IndexReader r) throws IOException { private void checkCommonSuggestions(IndexReader r) throws IOException {
String[] similar = spellChecker.suggestSimilar("fvie", 2); String[] similar = spellChecker.suggestSimilar("fvie", 2);
assertTrue(similar.length > 0); assertTrue(similar.length > 0);
@ -204,9 +225,9 @@ public class TestSpellChecker extends LuceneTestCase {
assertEquals(similar[1], "ninety"); assertEquals(similar[1], "ninety");
} }
private void addwords(IndexReader r, String field) throws IOException { private void addwords(IndexReader r, SpellChecker sc, String field) throws IOException {
long time = System.currentTimeMillis(); long time = System.currentTimeMillis();
spellChecker.indexDictionary(new LuceneDictionary(r, field)); sc.indexDictionary(new LuceneDictionary(r, field));
time = System.currentTimeMillis() - time; time = System.currentTimeMillis() - time;
//System.out.println("time to build " + field + ": " + time); //System.out.println("time to build " + field + ": " + time);
} }
@ -224,9 +245,9 @@ public class TestSpellChecker extends LuceneTestCase {
IndexReader r = IndexReader.open(userindex, true); IndexReader r = IndexReader.open(userindex, true);
spellChecker.clearIndex(); spellChecker.clearIndex();
String field = "field1"; String field = "field1";
addwords(r, "field1"); addwords(r, spellChecker, "field1");
int num_field1 = this.numdoc(); int num_field1 = this.numdoc();
addwords(r, "field2"); addwords(r, spellChecker, "field2");
int num_field2 = this.numdoc(); int num_field2 = this.numdoc();
assertEquals(num_field2, num_field1 + 1); assertEquals(num_field2, num_field1 + 1);
checkCommonSuggestions(r); checkCommonSuggestions(r);
@ -280,10 +301,10 @@ public class TestSpellChecker extends LuceneTestCase {
final IndexReader r = IndexReader.open(userindex, true); final IndexReader r = IndexReader.open(userindex, true);
spellChecker.clearIndex(); spellChecker.clearIndex();
assertEquals(2, searchers.size()); assertEquals(2, searchers.size());
addwords(r, "field1"); addwords(r, spellChecker, "field1");
assertEquals(3, searchers.size()); assertEquals(3, searchers.size());
int num_field1 = this.numdoc(); int num_field1 = this.numdoc();
addwords(r, "field2"); addwords(r, spellChecker, "field2");
assertEquals(4, searchers.size()); assertEquals(4, searchers.size());
int num_field2 = this.numdoc(); int num_field2 = this.numdoc();
assertEquals(num_field2, num_field1 + 1); assertEquals(num_field2, num_field1 + 1);
@ -396,6 +417,10 @@ public class TestSpellChecker extends LuceneTestCase {
super(spellIndex, sd); super(spellIndex, sd);
} }
/**
* Creates a mock spell checker that forwards the index directory, the string
* distance and the suggestion comparator unchanged to the superclass
* constructor.
* @param spellIndex the spelling index directory
* @param sd the string distance measure
* @param comparator the comparator used to order suggestions
* @throws IOException if the superclass constructor fails
*/
public SpellCheckerMock(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
super(spellIndex, sd, comparator);
}
@Override @Override
IndexSearcher createSearcher(Directory dir) throws IOException { IndexSearcher createSearcher(Directory dir) throws IOException {
IndexSearcher searcher = super.createSearcher(dir); IndexSearcher searcher = super.createSearcher(dir);

View File

@ -24,6 +24,8 @@ import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.search.spell.LevensteinDistance; import org.apache.lucene.search.spell.LevensteinDistance;
import org.apache.lucene.search.spell.StringDistance; import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordQueue;
import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.PriorityQueue;
import org.apache.solr.client.solrj.response.SpellCheckResponse; import org.apache.solr.client.solrj.response.SpellCheckResponse;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -157,59 +159,7 @@ public class SpellCheckComponent extends SearchComponent implements SolrCoreAwar
} }
} }
/**
 * Priority queue of {@link SuggestWord} entries; ordering is delegated to
 * {@link SuggestWord#compareTo(SuggestWord)}.
 */
static class SuggestWordQueue extends PriorityQueue {

  /**
   * @param size the size passed to <code>initialize</code>
   */
  SuggestWordQueue(int size) {
    initialize(size);
  }

  /**
   * @return <code>true</code> when <code>a</code> compares below <code>b</code>
   */
  @Override
  protected boolean lessThan(Object a, Object b) {
    // both arguments are cast before the comparison, left one first
    return ((SuggestWord) a).compareTo((SuggestWord) b) < 0;
  }
}
/**
 * Borrowed from Lucene SpellChecker.
 * <p>
 * Holds one suggested word with its score and frequency; words compare by
 * score first, then by frequency.
 */
static class SuggestWord {

  /** the score of the word */
  public float score;

  /** The freq of the word */
  public int freq;

  /** the suggested word */
  public String string;

  /**
   * @param a the word to compare against
   * @return 1, -1 or 0 when this word sorts after, before or equal to <code>a</code>
   */
  public final int compareTo(SuggestWord a) {
    // first criteria: the edit distance
    int byScore = score < a.score ? -1 : (score > a.score ? 1 : 0);
    if (byScore != 0) {
      return byScore;
    }
    // second criteria (if first criteria is equal): the popularity
    return freq < a.freq ? -1 : (freq > a.freq ? 1 : 0);
  }
}
@Override @Override
public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) { public void modifyRequest(ResponseBuilder rb, SearchComponent who, ShardRequest sreq) {