mirror of https://github.com/apache/lucene.git
LUCENE-5795: MoreLikeThisQuery now only collects the top N terms
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1609474 13f79535-47bb-0310-9956-ffa450edef68
parent 8f7dc8d07b
commit 173a44e67c
CHANGES.txt

@@ -143,6 +143,10 @@ Optimizations
   to another analyzer, e.g. per field name: PerFieldAnalyzerWrapper and
   Solr's schema support. (Shay Banon, Uwe Schindler, Robert Muir)
 
+* LUCENE-5795: MoreLikeThisQuery now only collects the top N terms instead
+  of collecting all terms from the like text when building the query.
+  (Alex Ksikes, Simon Willnauer)
+
 Bug Fixes
 
 * LUCENE-5796: Fixes the Scorer.getChildren() method for two combinations
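The change in a nutshell: the query builder used to push every term of the like text into a priority queue sized to the whole vocabulary of that text; it now keeps a queue bounded at maxQueryTerms and only displaces the weakest entry when a better-scoring term arrives. A minimal, self-contained sketch of that bounded top-N pattern (hypothetical names; plain java.util.PriorityQueue rather than Lucene's org.apache.lucene.util.PriorityQueue used in the actual patch):

import java.util.PriorityQueue;

// Sketch: keep only the `limit` highest-scoring terms while streaming
// candidates, using a min-heap whose head is the weakest survivor.
final class TopTerms {
  static final class Entry {
    final String term;
    final float score;
    Entry(String term, float score) { this.term = term; this.score = score; }
  }

  private final int limit; // assumed > 0, mirroring maxQueryTerms
  // Min-heap on score: the head is always the weakest surviving term.
  private final PriorityQueue<Entry> heap =
      new PriorityQueue<>((Entry a, Entry b) -> Float.compare(a.score, b.score));

  TopTerms(int limit) { this.limit = limit; }

  void offer(String term, float score) {
    if (heap.size() < limit) {
      heap.add(new Entry(term, score));   // still room: just add
    } else if (heap.peek().score < score) {
      heap.poll();                        // evict the current weakest
      heap.add(new Entry(term, score));
    }                                     // else: below the bar, drop it
  }
}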
MoreLikeThis.java

@@ -604,22 +604,19 @@ public final class MoreLikeThis {
   /**
    * Create the More like query from a PriorityQueue
    */
-  private Query createQuery(PriorityQueue<Object[]> q) {
+  private Query createQuery(PriorityQueue<ScoreTerm> q) {
     BooleanQuery query = new BooleanQuery();
-    Object cur;
-    int qterms = 0;
-    float bestScore = 0;
+    ScoreTerm scoreTerm;
+    float bestScore = -1;
 
-    while ((cur = q.pop()) != null) {
-      Object[] ar = (Object[]) cur;
-      TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
+    while ((scoreTerm = q.pop()) != null) {
+      TermQuery tq = new TermQuery(new Term(scoreTerm.topField, scoreTerm.word));
 
       if (boost) {
-        if (qterms == 0) {
-          bestScore = ((Float) ar[2]);
+        if (bestScore == -1) {
+          bestScore = (scoreTerm.score);
         }
-        float myScore = ((Float) ar[2]);
-
+        float myScore = (scoreTerm.score);
         tq.setBoost(boostFactor * myScore / bestScore);
       }
 
@@ -629,13 +626,7 @@ public final class MoreLikeThis {
       catch (BooleanQuery.TooManyClauses ignore) {
         break;
       }
-
-      qterms++;
-      if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
-        break;
-      }
-
     }
 
     return query;
   }
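Because the queue is now capped at maxQueryTerms before it ever reaches createQuery (see the createQueue hunk below), the qterms counter and its early break are redundant: the loop simply drains the queue, and BooleanQuery.TooManyClauses remains the only stop condition. Each clause's boost is the term's score normalized by the score of the first term popped. A hypothetical standalone helper restating that math (not part of the patch):

// Each term's boost is its score relative to the first popped term's
// score, scaled by the user-supplied boostFactor.
static float clauseBoost(float boostFactor, float termScore, float firstPoppedScore) {
  // e.g. boostFactor = 1.0f, firstPoppedScore = 2.0f, termScore = 4.0f
  // yields a boost of 2.0f
  return boostFactor * termScore / firstPoppedScore;
}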
@@ -644,10 +635,11 @@ public final class MoreLikeThis {
    *
    * @param words a map of words keyed on the word(String) with Int objects as the values.
    */
-  private PriorityQueue<Object[]> createQueue(Map<String, Int> words) throws IOException {
+  private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
     // have collected all words in doc and their freqs
     int numDocs = ir.numDocs();
-    FreqQ res = new FreqQ(words.size()); // will order words by score
+    final int limit = Math.min(maxQueryTerms, words.size());
+    FreqQ queue = new FreqQ(limit); // will order words by score
 
     for (String word : words.keySet()) { // for every word
       int tf = words.get(word).x; // term freq in the source doc
@@ -679,16 +671,18 @@ public final class MoreLikeThis {
       float idf = similarity.idf(docFreq, numDocs);
       float score = tf * idf;
 
-      // only really need 1st 3 entries, other ones are for troubleshooting
-      res.insertWithOverflow(new Object[]{word,    // the word
-          topField,    // the top field
-          score,       // overall score
-          idf,         // idf
-          docFreq,     // freq in all docs
-          tf
-      });
+      if (queue.size() < limit) {
+        // there is still space in the queue
+        queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
+      } else {
+        ScoreTerm term = queue.top();
+        if (term.score < score) { // update the smallest in the queue in place and update the queue.
+          term.update(word, topField, score, idf, docFreq, tf);
+          queue.updateTop();
+        }
+      }
     }
-    return res;
+    return queue;
   }
 
   /**
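The new insert path also avoids one allocation per candidate term: insertWithOverflow always built a fresh Object[] even for terms that could never make the cut, whereas the code above allocates only while the queue still has room and otherwise mutates the evictable head entry via top()/updateTop(). A sketch of that reuse idiom, written as if a hypothetical offerTerm helper had access to the FreqQ and ScoreTerm classes added later in this commit:

// Hypothetical helper extracting the insert-or-update idiom from
// createQueue above (FreqQ and ScoreTerm are private in MoreLikeThis;
// this assumes they were visible).
static void offerTerm(FreqQ queue, int limit, String word, String topField,
                      float score, float idf, int docFreq, int tf) {
  if (queue.size() < limit) {
    // still space: allocate a new entry
    queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
  } else {
    ScoreTerm weakest = queue.top(); // min-heap: weakest survivor at the top
    if (weakest.score < score) {
      weakest.update(word, topField, score, idf, docFreq, tf); // reuse in place
      queue.updateTop(); // restore heap order after the mutation
    }
  }
}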
@@ -717,7 +711,7 @@ public final class MoreLikeThis {
    *
    * @param docNum the id of the lucene document from which to find terms
    */
-  public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
+  private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
     Map<String, Int> termFreqMap = new HashMap<>();
     for (String fieldName : fieldNames) {
       final Fields vectors = ir.getTermVectors(docNum);
@@ -857,7 +851,7 @@ public final class MoreLikeThis {
    * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
    * @see #retrieveInterestingTerms
    */
-  public PriorityQueue<Object[]> retrieveTerms(Reader r, String fieldName) throws IOException {
+  private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
     Map<String, Int> words = new HashMap<>();
     addTermFrequencies(r, words, fieldName);
     return createQueue(words);
@@ -868,13 +862,12 @@ public final class MoreLikeThis {
    */
   public String[] retrieveInterestingTerms(int docNum) throws IOException {
     ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
-    PriorityQueue<Object[]> pq = retrieveTerms(docNum);
-    Object cur;
+    PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
+    ScoreTerm scoreTerm;
     int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
     // we just want to return the top words
-    while (((cur = pq.pop()) != null) && lim-- > 0) {
-      Object[] ar = (Object[]) cur;
-      al.add(ar[0]); // the 1st entry is the interesting word
+    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
+      al.add(scoreTerm.word); // the 1st entry is the interesting word
     }
     String[] res = new String[al.size()];
     return al.toArray(res);
@@ -892,13 +885,12 @@ public final class MoreLikeThis {
    */
   public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
     ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
-    PriorityQueue<Object[]> pq = retrieveTerms(r, fieldName);
-    Object cur;
+    PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
+    ScoreTerm scoreTerm;
     int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
     // we just want to return the top words
-    while (((cur = pq.pop()) != null) && lim-- > 0) {
-      Object[] ar = (Object[]) cur;
-      al.add(ar[0]); // the 1st entry is the interesting word
+    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
+      al.add(scoreTerm.word); // the 1st entry is the interesting word
    }
     String[] res = new String[al.size()];
     return al.toArray(res);
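Since retrieveTerms can no longer return more than maxQueryTerms entries, the lim guard in both retrieveInterestingTerms variants is now belt-and-braces rather than the actual cap. Caller-side behavior is unchanged; a hedged usage sketch (index setup assumed; reader is an open IndexReader whose documents store term vectors for "text"):

MoreLikeThis mlt = new MoreLikeThis(reader);
mlt.setFieldNames(new String[] {"text"});
mlt.setMaxQueryTerms(10);
int docId = 0; // some existing document id
String[] interesting = mlt.retrieveInterestingTerms(docId); // at most 10 terms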
@@ -907,16 +899,42 @@ public final class MoreLikeThis {
   /**
    * PriorityQueue that orders words by score.
    */
-  private static class FreqQ extends PriorityQueue<Object[]> {
-    FreqQ(int s) {
-      super(s);
+  private static class FreqQ extends PriorityQueue<ScoreTerm> {
+    FreqQ(int maxSize) {
+      super(maxSize);
     }
 
     @Override
-    protected boolean lessThan(Object[] aa, Object[] bb) {
-      Float fa = (Float) aa[2];
-      Float fb = (Float) bb[2];
-      return fa > fb;
+    protected boolean lessThan(ScoreTerm a, ScoreTerm b) {
+      return a.score < b.score;
+    }
+  }
+
+  private static class ScoreTerm {
+    // only really need 1st 3 entries, other ones are for troubleshooting
+    String word;
+    String topField;
+    float score;
+    float idf;
+    int docFreq;
+    int tf;
+
+    ScoreTerm(String word, String topField, float score, float idf, int docFreq, int tf) {
+      this.word = word;
+      this.topField = topField;
+      this.score = score;
+      this.idf = idf;
+      this.docFreq = docFreq;
+      this.tf = tf;
+    }
+
+    void update(String word, String topField, float score, float idf, int docFreq, int tf) {
+      this.word = word;
+      this.topField = topField;
+      this.score = score;
+      this.idf = idf;
+      this.docFreq = docFreq;
+      this.tf = tf;
     }
   }
 }
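Note the comparator flip: the old FreqQ treated the higher score as "less", so the best term sat at the head of the heap, while the new lessThan makes FreqQ a min-heap on score, so top() is the weakest kept term, exactly the entry a bounded queue needs to inspect and evict. One consequence is that pop() now drains terms from weakest to strongest, so in createQuery bestScore becomes the smallest kept score and relative boosts scale upward from boostFactor rather than downward; the proportions between terms are unchanged. A tiny driver sketch, again written as if FreqQ and ScoreTerm were visible outside MoreLikeThis:

FreqQ q = new FreqQ(2);
q.add(new ScoreTerm("rare", "text", 3.0f, 3.0f, 1, 1));
q.add(new ScoreTerm("common", "text", 1.0f, 1.0f, 9, 1));
System.out.println(q.pop().word); // "common" -- lowest score pops first
System.out.println(q.pop().word); // "rare"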
TestMoreLikeThis.java

@@ -75,6 +75,14 @@ public class TestMoreLikeThis extends LuceneTestCase {
     writer.addDocument(doc);
   }
 
+  private void addDoc(RandomIndexWriter writer, String[] texts) throws IOException {
+    Document doc = new Document();
+    for (String text : texts) {
+      doc.add(newTextField("text", text, Field.Store.YES));
+    }
+    writer.addDocument(doc);
+  }
+
   public void testBoostFactor() throws Throwable {
     Map<String,Float> originalValues = getOriginalValues();
 
@@ -166,5 +174,62 @@ public class TestMoreLikeThis extends LuceneTestCase {
     Query query = new MoreLikeThisQuery("this is a test", new String[] { "text" }, new MockAnalyzer(random()), "text");
     QueryUtils.check(random(), query, searcher);
   }
+
+  public void testTopN() throws Exception {
+    int numDocs = 100;
+    int topN = 25;
+
+    // add series of docs with terms of decreasing df
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    for (int i = 0; i < numDocs; i++) {
+      addDoc(writer, generateStrSeq(0, i + 1));
+    }
+    IndexReader reader = writer.getReader();
+    writer.shutdown();
+
+    // setup MLT query
+    MoreLikeThis mlt = new MoreLikeThis(reader);
+    mlt.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
+    mlt.setMaxQueryTerms(topN);
+    mlt.setMinDocFreq(1);
+    mlt.setMinTermFreq(1);
+    mlt.setMinWordLen(1);
+    mlt.setFieldNames(new String[]{"text"});
+
+    // perform MLT query
+    String likeText = "";
+    for (String text : generateStrSeq(0, numDocs)) {
+      likeText += text + " ";
+    }
+    BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText));
+
+    // check best terms are topN of highest idf
+    List<BooleanClause> clauses = query.clauses();
+    assertEquals("Expected " + topN + " clauses only!", topN, clauses.size());
+
+    Term[] expectedTerms = new Term[topN];
+    int idx = 0;
+    for (String text : generateStrSeq(numDocs - topN, topN)) {
+      expectedTerms[idx++] = new Term("text", text);
+    }
+    for (BooleanClause clause : clauses) {
+      Term term = ((TermQuery) clause.getQuery()).getTerm();
+      assertTrue(Arrays.asList(expectedTerms).contains(term));
+    }
+
+    // clean up
+    reader.close();
+    dir.close();
+  }
+
+  private String[] generateStrSeq(int from, int size) {
+    String[] generatedStrings = new String[size];
+    for (int i = 0; i < generatedStrings.length; i++) {
+      generatedStrings[i] = String.valueOf(from + i);
+    }
+    return generatedStrings;
+  }
+
   // TODO: add tests for the MoreLikeThisQuery
 }
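Why the expected clauses are exactly generateStrSeq(numDocs - topN, topN): doc i holds the terms "0".."i", so term "k" appears in docs k..numDocs-1 and has docFreq = numDocs - k. The larger k, the rarer the term and the higher its idf, so with topN = 25 the best terms are "75".."99". A hypothetical helper spelling out that relationship (not part of the test):

// Restates the test's construction: term "k" first appears in doc k
// and in every later doc, so its document frequency is numDocs - k,
// which shrinks (and idf grows) as k grows.
static int expectedDocFreq(int k, int numDocs) {
  return numDocs - k;
}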