From e8dac9bfdf358fff3b484ed5cd9032c1fe4bae96 Mon Sep 17 00:00:00 2001
From: Tommaso Teofili <teofili@adobe.com>
Date: Mon, 28 Mar 2016 10:06:49 +0200
Subject: [PATCH] LUCENE-6954 - keep info about relationship between fields and
 terms when retrieving terms in MLT

---
 .../lucene/queries/mlt/MoreLikeThis.java      | 127 ++++++++++--------
 .../lucene/queries/mlt/TestMoreLikeThis.java  |  82 +++++++++++
 2 files changed, 154 insertions(+), 55 deletions(-)
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
index 161dddb7f60..ea02af3f8f4 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
@@ -603,11 +603,11 @@ public final class MoreLikeThis {
    * @return a query that will return docs like the passed Readers.
    */
   public Query like(String fieldName, Reader... readers) throws IOException {
-    Map<String, Int> words = new HashMap<>();
+    Map<String, Map<String, Int>> perFieldTermFrequencies = new HashMap<>();
     for (Reader r : readers) {
-      addTermFrequencies(r, words, fieldName);
+      addTermFrequencies(r, perFieldTermFrequencies, fieldName);
     }
-    return createQuery(createQueue(words));
+    return createQuery(createQueue(perFieldTermFrequencies));
   }
 
   /**
@@ -642,58 +642,65 @@ public final class MoreLikeThis {
   /**
    * Create a PriorityQueue from a word-&gt;tf map.
    *
-   * @param words a map of words keyed on the word(String) with Int objects as the values.
+   * @param perFieldTermFrequencies a per field map of words keyed on the word(String) with Int objects as the values.
    */
-  private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
+  private PriorityQueue<ScoreTerm> createQueue(Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
     // have collected all words in doc and their freqs
     int numDocs = ir.numDocs();
-    final int limit = Math.min(maxQueryTerms, words.size());
+    final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
     FreqQ queue = new FreqQ(limit); // will order words by score
+    for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) {
+      Map<String, Int> perWordTermFrequencies = entry.getValue();
+      String fieldName = entry.getKey();
 
-    for (String word : words.keySet()) { // for every word
-      int tf = words.get(word).x; // term freq in the source doc
-      if (minTermFreq > 0 && tf < minTermFreq) {
-        continue; // filter out words that don't occur enough times in the source
-      }
+      for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
+        String word = tfEntry.getKey();
+        int tf = tfEntry.getValue().x; // term freq in the source doc
+        if (minTermFreq > 0 && tf < minTermFreq) {
+          continue; // filter out words that don't occur enough times in the source
+        }
 
-      // go through all the fields and find the largest document frequency
-      String topField = fieldNames[0];
-      int docFreq = 0;
-      for (String fieldName : fieldNames) {
-        int freq = ir.docFreq(new Term(fieldName, word));
-        topField = (freq > docFreq) ? fieldName : topField;
-        docFreq = (freq > docFreq) ? freq : docFreq;
-      }
+        int docFreq = ir.docFreq(new Term(fieldName, word));
 
-      if (minDocFreq > 0 && docFreq < minDocFreq) {
-        continue; // filter out words that don't occur in enough docs
-      }
+        if (minDocFreq > 0 && docFreq < minDocFreq) {
+          continue; // filter out words that don't occur in enough docs
+        }
 
-      if (docFreq > maxDocFreq) {
-        continue; // filter out words that occur in too many docs
-      }
+        if (docFreq > maxDocFreq) {
+          continue; // filter out words that occur in too many docs
+        }
 
-      if (docFreq == 0) {
-        continue; // index update problem?
-      }
+        if (docFreq == 0) {
+          continue; // index update problem?
+        }
 
-      float idf = similarity.idf(docFreq, numDocs);
-      float score = tf * idf;
+        float idf = similarity.idf(docFreq, numDocs);
+        float score = tf * idf;
 
-      if (queue.size() < limit) {
-        // there is still space in the queue
-        queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
-      } else {
-        ScoreTerm term = queue.top();
-        if (term.score < score) { // update the smallest in the queue in place and update the queue.
-          term.update(word, topField, score, idf, docFreq, tf);
-          queue.updateTop();
+        if (queue.size() < limit) {
+          // there is still space in the queue
+          queue.add(new ScoreTerm(word, fieldName, score, idf, docFreq, tf));
+        } else {
+          ScoreTerm term = queue.top();
+          if (term.score < score) { // update the smallest in the queue in place and update the queue.
+            term.update(word, fieldName, score, idf, docFreq, tf);
+            queue.updateTop();
+          }
         }
       }
     }
     return queue;
   }
 
+  private int getTermsCount(Map<String, Map<String, Int>> perFieldTermFrequencies) {
+    int totalTermsCount = 0;
+    Collection<Map<String, Int>> values = perFieldTermFrequencies.values();
+    for (Map<String, Int> perWordTermFrequencies : values) {
+      totalTermsCount += perWordTermFrequencies.size();
+    }
+    return totalTermsCount;
+  }
+
   /**
    * Describe the parameters that control how the "more like this" query is formed.
    */
@@ -721,7 +728,7 @@ public final class MoreLikeThis {
    * @param docNum the id of the lucene document from which to find terms
    */
   private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
-    Map<String, Int> termFreqMap = new HashMap<>();
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
     for (String fieldName : fieldNames) {
       final Fields vectors = ir.getTermVectors(docNum);
       final Terms vector;
@@ -738,43 +745,48 @@ public final class MoreLikeThis {
         for (IndexableField field : fields) {
           final String stringValue = field.stringValue();
           if (stringValue != null) {
-            addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
+            addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName);
           }
         }
       } else {
-        addTermFrequencies(termFreqMap, vector);
+        addTermFrequencies(field2termFreqMap, vector, fieldName);
       }
     }
 
-    return createQueue(termFreqMap);
+    return createQueue(field2termFreqMap);
   }
 
 
-  private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> fields) throws 
+  private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> field2fieldValues) throws
       IOException {
-    HashMap<String,Int> termFreqMap = new HashMap<>();
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
     for (String fieldName : fieldNames) {
-      for (String field : fields.keySet()) {
-        Collection<Object> fieldValues = fields.get(field);
+      for (String field : field2fieldValues.keySet()) {
+        Collection<Object> fieldValues = field2fieldValues.get(field);
         if(fieldValues == null)
           continue;
         for(Object fieldValue:fieldValues) {
           if (fieldValue != null) {
-            addTermFrequencies(new StringReader(String.valueOf(fieldValue)), termFreqMap,
+            addTermFrequencies(new StringReader(String.valueOf(fieldValue)), field2termFreqMap,
                 fieldName);
           }
         }
       }
     }
-    return createQueue(termFreqMap);
+    return createQueue(field2termFreqMap);
   }
   /**
    * Adds terms and frequencies found in vector into the Map termFreqMap
    *
-   * @param termFreqMap a Map of terms and their frequencies
+   * @param field2termFreqMap a Map of terms and their frequencies per field
    * @param vector List of terms and their frequencies for a doc/field
    */
-  private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
+  private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException {
+    Map<String, Int> termFreqMap = field2termFreqMap.get(fieldName);
+    if (termFreqMap == null) {
+      termFreqMap = new HashMap<>();
+      field2termFreqMap.put(fieldName, termFreqMap);
+    }
     final TermsEnum termsEnum = vector.iterator();
     final CharsRefBuilder spare = new CharsRefBuilder();
     BytesRef text;
@@ -802,15 +814,20 @@ public final class MoreLikeThis {
    * Adds term frequencies found by tokenizing text from reader into the Map words
    *
    * @param r a source of text to be tokenized
-   * @param termFreqMap a Map of terms and their frequencies
+   * @param perFieldTermFrequencies a Map of terms and their frequencies per field
    * @param fieldName Used by analyzer for any special per-field analysis
    */
-  private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
+  private void addTermFrequencies(Reader r, Map<String, Map<String, Int>> perFieldTermFrequencies, String fieldName)
       throws IOException {
     if (analyzer == null) {
       throw new UnsupportedOperationException("To use MoreLikeThis without " +
           "term vectors, you must provide an Analyzer");
     }
+    Map<String, Int> termFreqMap = perFieldTermFrequencies.get(fieldName);
+    if (termFreqMap == null) {
+      termFreqMap = new HashMap<>();
+      perFieldTermFrequencies.put(fieldName, termFreqMap);
+    }
     try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
       int tokenCount = 0;
       // for every token
@@ -880,9 +897,9 @@ public final class MoreLikeThis {
    * @see #retrieveInterestingTerms
    */
   private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
-    Map<String, Int> words = new HashMap<>();
-    addTermFrequencies(r, words, fieldName);
-    return createQueue(words);
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
+    addTermFrequencies(r, field2termFreqMap, fieldName);
+    return createQueue(field2termFreqMap);
   }
 
   /**
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
index 6eb42662c62..5e6466fc061 100644
--- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java
@@ -18,6 +18,7 @@ package org.apache.lucene.queries.mlt;
 
 import java.io.IOException;
 import java.io.StringReader;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;
@@ -40,8 +41,14 @@ import org.apache.lucene.search.QueryUtils;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
 
 public class TestMoreLikeThis extends LuceneTestCase {
+
+  private static final String SHOP_TYPE = "type";
+  private static final String FOR_SALE = "weSell";
+  private static final String NOT_FOR_SALE = "weDontSell";
+
   private Directory directory;
   private IndexReader reader;
   private IndexSearcher searcher;
@@ -246,5 +253,80 @@ public class TestMoreLikeThis extends LuceneTestCase {
     return generatedStrings;
   }
 
+  private int addShopDoc(RandomIndexWriter writer, String type, String[] weSell, String[] weDontSell) throws IOException {
+    Document doc = new Document();
+    doc.add(newTextField(SHOP_TYPE, type, Field.Store.YES));
+    for (String item : weSell) {
+      doc.add(newTextField(FOR_SALE, item, Field.Store.YES));
+    }
+    for (String item : weDontSell) {
+      doc.add(newTextField(NOT_FOR_SALE, item, Field.Store.YES));
+    }
+    writer.addDocument(doc);
+    return writer.numDocs() - 1;
+  }
+
+  public void testMultiFieldShouldReturnPerFieldBooleanQuery() throws Exception {
+    IndexReader reader = null;
+    Directory dir = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+    try {
+      int maxQueryTerms = 25;
+
+      String[] itShopItemForSale = new String[]{"watch", "ipod", "asrock", "imac", "macbookpro", "monitor", "keyboard", "mouse", "speakers"};
+      String[] itShopItemNotForSale = new String[]{"tie", "trousers", "shoes", "skirt", "hat"};
+
+      String[] clothesShopItemForSale = new String[]{"tie", "trousers", "shoes", "skirt", "hat"};
+      String[] clothesShopItemNotForSale = new String[]{"watch", "ipod", "asrock", "imac", "macbookpro", "monitor", "keyboard", "mouse", "speakers"};
+
+      // add series of shop docs
+      RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+      for (int i = 0; i < 100; i++) {
+        addShopDoc(writer, "it", itShopItemForSale, itShopItemNotForSale);
+      }
+      for (int i = 0; i < 10; i++) {
+        addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale);
+      }
+      // Input Document is a clothes shop
+      int inputDocId = addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale);
+      reader = writer.getReader();
+      writer.close();
+
+      // setup MLT query
+      MoreLikeThis mlt = new MoreLikeThis(reader);
+
+      mlt.setAnalyzer(analyzer);
+      mlt.setMaxQueryTerms(maxQueryTerms);
+      mlt.setMinDocFreq(1);
+      mlt.setMinTermFreq(1);
+      mlt.setMinWordLen(1);
+      mlt.setFieldNames(new String[]{FOR_SALE, NOT_FOR_SALE});
+
+      // perform MLT query
+      BooleanQuery query = (BooleanQuery) mlt.like(inputDocId);
+      Collection<BooleanClause> clauses = query.clauses();
+
+      Collection<BooleanClause> expectedClothesShopClauses = new ArrayList<BooleanClause>();
+      for (String itemForSale : clothesShopItemForSale) {
+        BooleanClause booleanClause = new BooleanClause(new TermQuery(new Term(FOR_SALE, itemForSale)), BooleanClause.Occur.SHOULD);
+        expectedClothesShopClauses.add(booleanClause);
+      }
+      for (String itemNotForSale : clothesShopItemNotForSale) {
+        BooleanClause booleanClause = new BooleanClause(new TermQuery(new Term(NOT_FOR_SALE, itemNotForSale)), BooleanClause.Occur.SHOULD);
+        expectedClothesShopClauses.add(booleanClause);
+      }
+
+      for (BooleanClause expectedClause : expectedClothesShopClauses) {
+        assertTrue(clauses.contains(expectedClause));
+      }
+    } finally {
+      // clean up
+      if (reader != null) {
+        reader.close();
+      }
+      dir.close();
+      analyzer.close();
+    }
+  }
   // TODO: add tests for the MoreLikeThisQuery
 }