From 173a44e67c7c3c1a9ffbe7259ea8b45f1f53b015 Mon Sep 17 00:00:00 2001
From: Simon Willnauer
Date: Thu, 10 Jul 2014 15:15:44 +0000
Subject: [PATCH] LUCENE-5795: MoreLikeThisQuery now only collects the top N terms

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1609474 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                              |   4 +
 .../lucene/queries/mlt/MoreLikeThis.java        | 112 ++++++++++--------
 .../lucene/queries/mlt/TestMoreLikeThis.java    |  67 ++++++++++-
 3 files changed, 135 insertions(+), 48 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 65902e952e5..8b23c72ac4b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -143,6 +143,10 @@ Optimizations
   to another analyzer, e.g. per field name: PerFieldAnalyzerWrapper and
   Solr's schema support. (Shay Banon, Uwe Schindler, Robert Muir)
 
+* LUCENE-5795: MoreLikeThisQuery now only collects the top N terms instead
+  of collecting all terms from the like text when building the query.
+  (Alex Ksikes, Simon Willnauer)
+
 Bug Fixes
 
 * LUCENE-5796: Fixes the Scorer.getChildren() method for two combinations

diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
index 0e181e67d3e..db926ec4ad6 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
@@ -604,22 +604,19 @@ public final class MoreLikeThis {
   /**
    * Create the More like query from a PriorityQueue
    */
-  private Query createQuery(PriorityQueue<Object[]> q) {
+  private Query createQuery(PriorityQueue<ScoreTerm> q) {
     BooleanQuery query = new BooleanQuery();
-    Object cur;
-    int qterms = 0;
-    float bestScore = 0;
+    ScoreTerm scoreTerm;
+    float bestScore = -1;
 
-    while ((cur = q.pop()) != null) {
-      Object[] ar = (Object[]) cur;
-      TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
+    while ((scoreTerm = q.pop()) != null) {
+      TermQuery tq = new TermQuery(new Term(scoreTerm.topField, scoreTerm.word));
 
       if (boost) {
-        if (qterms == 0) {
-          bestScore = ((Float) ar[2]);
+        if (bestScore == -1) {
+          bestScore = (scoreTerm.score);
         }
-        float myScore = ((Float) ar[2]);
-
+        float myScore = (scoreTerm.score);
         tq.setBoost(boostFactor * myScore / bestScore);
       }
 
@@ -629,13 +626,7 @@
       catch (BooleanQuery.TooManyClauses ignore) {
         break;
       }
-
-      qterms++;
-      if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
-        break;
-      }
     }
-
     return query;
   }
 
@@ -644,10 +635,11 @@
    *
    * @param words a map of words keyed on the word(String) with Int objects as the values.
    */
-  private PriorityQueue<Object[]> createQueue(Map<String, Int> words) throws IOException {
+  private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
     // have collected all words in doc and their freqs
     int numDocs = ir.numDocs();
-    FreqQ res = new FreqQ(words.size()); // will order words by score
+    final int limit = Math.min(maxQueryTerms, words.size());
+    FreqQ queue = new FreqQ(limit); // will order words by score
 
     for (String word : words.keySet()) { // for every word
       int tf = words.get(word).x; // term freq in the source doc
@@ -679,16 +671,18 @@
       float idf = similarity.idf(docFreq, numDocs);
       float score = tf * idf;
 
-      // only really need 1st 3 entries, other ones are for troubleshooting
-      res.insertWithOverflow(new Object[]{word,  // the word
-          topField,  // the top field
-          score,     // overall score
-          idf,       // idf
-          docFreq,   // freq in all docs
-          tf
-      });
+      if (queue.size() < limit) {
+        // there is still space in the queue
+        queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
+      } else {
+        ScoreTerm term = queue.top();
+        if (term.score < score) { // update the smallest in the queue in place and update the queue.
+          term.update(word, topField, score, idf, docFreq, tf);
+          queue.updateTop();
+        }
+      }
     }
-    return res;
+    return queue;
   }
 
   /**
@@ -717,7 +711,7 @@
    *
    * @param docNum the id of the lucene document from which to find terms
    */
-  public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
+  private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
     Map<String, Int> termFreqMap = new HashMap<>();
     for (String fieldName : fieldNames) {
       final Fields vectors = ir.getTermVectors(docNum);
@@ -857,7 +851,7 @@
    * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
    * @see #retrieveInterestingTerms
    */
-  public PriorityQueue<Object[]> retrieveTerms(Reader r, String fieldName) throws IOException {
+  private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
     Map<String, Int> words = new HashMap<>();
     addTermFrequencies(r, words, fieldName);
     return createQueue(words);
@@ -868,13 +862,12 @@
    */
   public String[] retrieveInterestingTerms(int docNum) throws IOException {
     ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
-    PriorityQueue<Object[]> pq = retrieveTerms(docNum);
-    Object cur;
+    PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
+    ScoreTerm scoreTerm;
     int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
     // we just want to return the top words
-    while (((cur = pq.pop()) != null) && lim-- > 0) {
-      Object[] ar = (Object[]) cur;
-      al.add(ar[0]); // the 1st entry is the interesting word
+    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
+      al.add(scoreTerm.word); // the 1st entry is the interesting word
     }
     String[] res = new String[al.size()];
     return al.toArray(res);
@@ -892,13 +885,12 @@
    */
   public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
     ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
-    PriorityQueue<Object[]> pq = retrieveTerms(r, fieldName);
-    Object cur;
+    PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
+    ScoreTerm scoreTerm;
     int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
     // we just want to return the top words
-    while (((cur = pq.pop()) != null) && lim-- > 0) {
-      Object[] ar = (Object[]) cur;
-      al.add(ar[0]); // the 1st entry is the interesting word
+    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
+      al.add(scoreTerm.word); // the 1st entry is the interesting word
     }
     String[] res = new String[al.size()];
     return al.toArray(res);
   }
@@ -907,16 +899,42 @@
   /**
    * PriorityQueue that orders words by score.
    */
-  private static class FreqQ extends PriorityQueue<Object[]> {
-    FreqQ(int s) {
-      super(s);
+  private static class FreqQ extends PriorityQueue<ScoreTerm> {
+    FreqQ(int maxSize) {
+      super(maxSize);
     }
 
     @Override
-    protected boolean lessThan(Object[] aa, Object[] bb) {
-      Float fa = (Float) aa[2];
-      Float fb = (Float) bb[2];
-      return fa > fb;
+    protected boolean lessThan(ScoreTerm a, ScoreTerm b) {
+      return a.score < b.score;
+    }
+  }
+
+  private static class ScoreTerm {
+    // only really need 1st 3 entries, other ones are for troubleshooting
+    String word;
+    String topField;
+    float score;
+    float idf;
+    int docFreq;
+    int tf;
+
+    ScoreTerm(String word, String topField, float score, float idf, int docFreq, int tf) {
+      this.word = word;
+      this.topField = topField;
+      this.score = score;
+      this.idf = idf;
+      this.docFreq = docFreq;
+      this.tf = tf;
+    }
+
+    void update(String word, String topField, float score, float idf, int docFreq, int tf) {
+      this.word = word;
+      this.topField = topField;
+      this.score = score;
+      this.idf = idf;
+      this.docFreq = docFreq;
+      this.tf = tf;
    }
  }

diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
index 15488e00f02..1d09e455cab 100644
--- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
@@ -74,7 +74,15 @@ public class TestMoreLikeThis extends LuceneTestCase {
     doc.add(newTextField("text", text, Field.Store.YES));
     writer.addDocument(doc);
   }
-  
+
+  private void addDoc(RandomIndexWriter writer, String[] texts) throws IOException {
+    Document doc = new Document();
+    for (String text : texts) {
+      doc.add(newTextField("text", text, Field.Store.YES));
+    }
+    writer.addDocument(doc);
+  }
+
   public void testBoostFactor() throws Throwable {
     Map<String,Float> originalValues = getOriginalValues();
 
@@ -166,5 +174,62 @@
     Query query = new MoreLikeThisQuery("this is a test", new String[] { "text" }, new MockAnalyzer(random()), "text");
     QueryUtils.check(random(), query, searcher);
   }
+
+  public void testTopN() throws Exception {
+    int numDocs = 100;
+    int topN = 25;
+
+    // add series of docs with terms of decreasing df
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    for (int i = 0; i < numDocs; i++) {
+      addDoc(writer, generateStrSeq(0, i + 1));
+    }
+    IndexReader reader = writer.getReader();
+    writer.shutdown();
+
+    // setup MLT query
+    MoreLikeThis mlt = new MoreLikeThis(reader);
+    mlt.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
+    mlt.setMaxQueryTerms(topN);
+    mlt.setMinDocFreq(1);
+    mlt.setMinTermFreq(1);
+    mlt.setMinWordLen(1);
+    mlt.setFieldNames(new String[]{"text"});
+
+    // perform MLT query
+    String likeText = "";
+    for (String text : generateStrSeq(0, numDocs)) {
+      likeText += text + " ";
+    }
+    BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText));
+
+    // check best terms are topN of highest idf
+    List<BooleanClause> clauses = query.clauses();
+    assertEquals("Expected" + topN + "clauses only!", topN, clauses.size());
+
+    Term[] expectedTerms = new Term[topN];
+    int idx = 0;
+    for (String text : generateStrSeq(numDocs - topN, topN)) {
+      expectedTerms[idx++] = new Term("text", text);
+    }
+    for (BooleanClause clause : clauses) {
+      Term term = ((TermQuery) clause.getQuery()).getTerm();
+      assertTrue(Arrays.asList(expectedTerms).contains(term));
+    }
+
+    // clean up
+    reader.close();
+    dir.close();
+  }
+
+  private String[] generateStrSeq(int from, int size) {
+    String[] generatedStrings = new String[size];
+    for (int i = 0; i < generatedStrings.length; i++) {
+      generatedStrings[i] = String.valueOf(from + i);
+    }
+    return generatedStrings;
+  }
+
   // TODO: add tests for the MoreLikeThisQuery
 }
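
Usage note (not part of the patch): after this change, setMaxQueryTerms() bounds the internal FreqQ priority queue itself, so only the best-scoring terms are ever retained while the query is built, instead of scoring every word from the like text and trimming afterwards. A minimal caller-side sketch against the patched API follows; the helper name, the IndexReader/Analyzer arguments, and the "text" field are illustrative assumptions, not something this commit adds.

    // Sketch (assumed names): build a MoreLikeThis query that keeps only the
    // best maxTerms terms, mirroring the setters used in testTopN() above.
    static Query buildMltQuery(IndexReader reader, Analyzer analyzer,
                               String likeText, int maxTerms) throws IOException {
      MoreLikeThis mlt = new MoreLikeThis(reader);
      mlt.setFieldNames(new String[]{"text"}); // assumes the index has a "text" field
      mlt.setAnalyzer(analyzer);               // any Analyzer matching how "text" was indexed
      mlt.setMinTermFreq(1);
      mlt.setMinDocFreq(1);
      mlt.setMaxQueryTerms(maxTerms);          // bounds the queue, so only the top-N terms are collected
      return mlt.like("text", new StringReader(likeText));
    }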