From 173a44e67c7c3c1a9ffbe7259ea8b45f1f53b015 Mon Sep 17 00:00:00 2001
From: Simon Willnauer
Date: Thu, 10 Jul 2014 15:15:44 +0000
Subject: [PATCH] LUCENE-5795: MoreLikeThisQuery now only collects the top N terms

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1609474 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                              |   4 +
 .../lucene/queries/mlt/MoreLikeThis.java        | 112 ++++++++++--------
 .../lucene/queries/mlt/TestMoreLikeThis.java    |  67 ++++++++++-
 3 files changed, 135 insertions(+), 48 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 65902e952e5..8b23c72ac4b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -143,6 +143,10 @@ Optimizations
   to another analyzer, e.g. per field name: PerFieldAnalyzerWrapper and
   Solr's schema support. (Shay Banon, Uwe Schindler, Robert Muir)
 
+* LUCENE-5795: MoreLikeThisQuery now only collects the top N terms instead
+  of collecting all terms from the like text when building the query.
+  (Alex Ksikes, Simon Willnauer)
+
 Bug Fixes
 
 * LUCENE-5796: Fixes the Scorer.getChildren() method for two combinations

diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
index 0e181e67d3e..db926ec4ad6 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
@@ -604,22 +604,19 @@ public final class MoreLikeThis {
   /**
    * Create the More like query from a PriorityQueue
    */
-  private Query createQuery(PriorityQueue<Object[]> q) {
+  private Query createQuery(PriorityQueue<ScoreTerm> q) {
     BooleanQuery query = new BooleanQuery();
-    Object cur;
-    int qterms = 0;
-    float bestScore = 0;
+    ScoreTerm scoreTerm;
+    float bestScore = -1;
 
-    while ((cur = q.pop()) != null) {
-      Object[] ar = (Object[]) cur;
-      TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
+    while ((scoreTerm = q.pop()) != null) {
+      TermQuery tq = new TermQuery(new Term(scoreTerm.topField, scoreTerm.word));
 
       if (boost) {
-        if (qterms == 0) {
-          bestScore = ((Float) ar[2]);
+        if (bestScore == -1) {
+          bestScore = (scoreTerm.score);
         }
-        float myScore = ((Float) ar[2]);
-
+        float myScore = (scoreTerm.score);
         tq.setBoost(boostFactor * myScore / bestScore);
       }
 
@@ -629,13 +626,7 @@
       catch (BooleanQuery.TooManyClauses ignore) {
         break;
       }
-
-      qterms++;
-      if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
-        break;
-      }
     }
-
     return query;
   }
 
@@ -644,10 +635,11 @@
    *
    * @param words a map of words keyed on the word(String) with Int objects as the values.
    */
-  private PriorityQueue<Object[]> createQueue(Map<String, Int> words) throws IOException {
+  private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
     // have collected all words in doc and their freqs
     int numDocs = ir.numDocs();
-    FreqQ res = new FreqQ(words.size()); // will order words by score
+    final int limit = Math.min(maxQueryTerms, words.size());
+    FreqQ queue = new FreqQ(limit); // will order words by score
 
     for (String word : words.keySet()) { // for every word
       int tf = words.get(word).x; // term freq in the source doc
@@ -679,16 +671,18 @@
       float idf = similarity.idf(docFreq, numDocs);
       float score = tf * idf;
 
-      // only really need 1st 3 entries, other ones are for troubleshooting
-      res.insertWithOverflow(new Object[]{word,  // the word
-          topField,  // the top field
-          score,     // overall score
-          idf,       // idf
-          docFreq,   // freq in all docs
-          tf
-      });
+      if (queue.size() < limit) {
+        // there is still space in the queue
+        queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
+      } else {
+        ScoreTerm term = queue.top();
+        if (term.score < score) { // update the smallest in the queue in place and update the queue.
+          term.update(word, topField, score, idf, docFreq, tf);
+          queue.updateTop();
+        }
+      }
     }
-    return res;
+    return queue;
   }
 
   /**
@@ -717,7 +711,7 @@
    *
    * @param docNum the id of the lucene document from which to find terms
    */
-  public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
+  private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
     Map<String, Int> termFreqMap = new HashMap<>();
     for (String fieldName : fieldNames) {
       final Fields vectors = ir.getTermVectors(docNum);
@@ -857,7 +851,7 @@
    * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
    * @see #retrieveInterestingTerms
    */
-  public PriorityQueue<Object[]> retrieveTerms(Reader r, String fieldName) throws IOException {
+  private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
     Map<String, Int> words = new HashMap<>();
     addTermFrequencies(r, words, fieldName);
     return createQueue(words);
@@ -868,13 +862,12 @@
    */
   public String[] retrieveInterestingTerms(int docNum) throws IOException {
     ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
-    PriorityQueue<Object[]> pq = retrieveTerms(docNum);
-    Object cur;
+    PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum);
+    ScoreTerm scoreTerm;
     int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
     // we just want to return the top words
-    while (((cur = pq.pop()) != null) && lim-- > 0) {
-      Object[] ar = (Object[]) cur;
-      al.add(ar[0]); // the 1st entry is the interesting word
+    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
+      al.add(scoreTerm.word); // the 1st entry is the interesting word
     }
     String[] res = new String[al.size()];
     return al.toArray(res);
@@ -892,13 +885,12 @@
    */
   public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
     ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
-    PriorityQueue<Object[]> pq = retrieveTerms(r, fieldName);
-    Object cur;
+    PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName);
+    ScoreTerm scoreTerm;
     int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
     // we just want to return the top words
-    while (((cur = pq.pop()) != null) && lim-- > 0) {
-      Object[] ar = (Object[]) cur;
-      al.add(ar[0]); // the 1st entry is the interesting word
+    while (((scoreTerm = pq.pop()) != null) && lim-- > 0) {
+      al.add(scoreTerm.word); // the 1st entry is the interesting word
     }
     String[] res = new String[al.size()];
     return al.toArray(res);
   }
@@ -907,16 +899,42 @@
   /**
    * PriorityQueue that orders words by score.
    */
-  private static class FreqQ extends PriorityQueue<Object[]> {
-    FreqQ(int s) {
-      super(s);
+  private static class FreqQ extends PriorityQueue<ScoreTerm> {
+    FreqQ(int maxSize) {
+      super(maxSize);
     }
 
     @Override
-    protected boolean lessThan(Object[] aa, Object[] bb) {
-      Float fa = (Float) aa[2];
-      Float fb = (Float) bb[2];
-      return fa > fb;
+    protected boolean lessThan(ScoreTerm a, ScoreTerm b) {
+      return a.score < b.score;
+    }
+  }
+
+  private static class ScoreTerm {
+    // only really need 1st 3 entries, other ones are for troubleshooting
+    String word;
+    String topField;
+    float score;
+    float idf;
+    int docFreq;
+    int tf;
+
+    ScoreTerm(String word, String topField, float score, float idf, int docFreq, int tf) {
+      this.word = word;
+      this.topField = topField;
+      this.score = score;
+      this.idf = idf;
+      this.docFreq = docFreq;
+      this.tf = tf;
+    }
+
+    void update(String word, String topField, float score, float idf, int docFreq, int tf) {
+      this.word = word;
+      this.topField = topField;
+      this.score = score;
+      this.idf = idf;
+      this.docFreq = docFreq;
+      this.tf = tf;
    }
  }

diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
index 15488e00f02..1d09e455cab 100644
--- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
@@ -74,7 +74,15 @@ public class TestMoreLikeThis extends LuceneTestCase {
     doc.add(newTextField("text", text, Field.Store.YES));
     writer.addDocument(doc);
   }
-  
+
+  private void addDoc(RandomIndexWriter writer, String[] texts) throws IOException {
+    Document doc = new Document();
+    for (String text : texts) {
+      doc.add(newTextField("text", text, Field.Store.YES));
+    }
+    writer.addDocument(doc);
+  }
+
   public void testBoostFactor() throws Throwable {
     Map<String,Float> originalValues = getOriginalValues();
 
@@ -166,5 +174,62 @@
     Query query = new MoreLikeThisQuery("this is a test", new String[] { "text" }, new MockAnalyzer(random()), "text");
     QueryUtils.check(random(), query, searcher);
   }
+
+  public void testTopN() throws Exception {
+    int numDocs = 100;
+    int topN = 25;
+
+    // add series of docs with terms of decreasing df
+    Directory dir = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+    for (int i = 0; i < numDocs; i++) {
+      addDoc(writer, generateStrSeq(0, i + 1));
+    }
+    IndexReader reader = writer.getReader();
+    writer.shutdown();
+
+    // setup MLT query
+    MoreLikeThis mlt = new MoreLikeThis(reader);
+    mlt.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
+    mlt.setMaxQueryTerms(topN);
+    mlt.setMinDocFreq(1);
+    mlt.setMinTermFreq(1);
+    mlt.setMinWordLen(1);
+    mlt.setFieldNames(new String[]{"text"});
+
+    // perform MLT query
+    String likeText = "";
+    for (String text : generateStrSeq(0, numDocs)) {
+      likeText += text + " ";
+    }
+    BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText));
+
+    // check best terms are topN of highest idf
+    List<BooleanClause> clauses = query.clauses();
+    assertEquals("Expected" + topN + "clauses only!", topN, clauses.size());
+
+    Term[] expectedTerms = new Term[topN];
+    int idx = 0;
+    for (String text : generateStrSeq(numDocs - topN, topN)) {
+      expectedTerms[idx++] = new Term("text", text);
+    }
+    for (BooleanClause clause : clauses) {
+      Term term = ((TermQuery) clause.getQuery()).getTerm();
+      assertTrue(Arrays.asList(expectedTerms).contains(term));
+    }
+
+    // clean up
+    reader.close();
+    dir.close();
+  }
+
+  private String[] generateStrSeq(int from, int size) {
+    String[] generatedStrings = new String[size];
+    for (int i = 0; i < generatedStrings.length; i++) {
+      generatedStrings[i] = String.valueOf(from + i);
+    }
+    return generatedStrings;
+  }
+
   // TODO: add tests for the MoreLikeThisQuery
 }
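
Usage note (not part of the patch): after this change, setMaxQueryTerms() bounds the internal FreqQ priority queue itself, so only the best-scoring terms are ever retained while the query is built, instead of scoring every word from the like text and trimming afterwards. A minimal caller-side sketch against the patched API follows; the helper name, the IndexReader/Analyzer arguments, and the "text" field are illustrative assumptions, not something this commit adds.

    // Sketch (assumed names): build a MoreLikeThis query that keeps only the
    // best maxTerms terms, mirroring the setters used in testTopN() above.
    static Query buildMltQuery(IndexReader reader, Analyzer analyzer,
                               String likeText, int maxTerms) throws IOException {
      MoreLikeThis mlt = new MoreLikeThis(reader);
      mlt.setFieldNames(new String[]{"text"}); // assumes the index has a "text" field
      mlt.setAnalyzer(analyzer);               // any Analyzer matching how "text" was indexed
      mlt.setMinTermFreq(1);
      mlt.setMinDocFreq(1);
      mlt.setMaxQueryTerms(maxTerms);          // bounds the queue, so only the top-N terms are collected
      return mlt.like("text", new StringReader(likeText));
    }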