LUCENE-6954 - keep info about relationship between fields and terms when retrieving terms in MLT

Author: Tommaso Teofili
Date:   2016-03-28 10:06:49 +02:00
Parent: 6f0488f730
Commit: e8dac9bfdf

2 changed files with 154 additions and 55 deletions
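
In short, the patch stops flattening all extracted terms into one map and instead keys term frequencies by field first, so the originating field travels with each term. A rough sketch of the shape change, with hypothetical field and term values borrowed from the test below:

    // before: one flat map, the source field is lost
    // Map<String, Int> words = { "ipod" -> 2, "tie" -> 1 }

    // after: frequencies grouped per field
    // Map<String, Map<String, Int>> perFieldTermFrequencies =
    //     { "weSell" -> { "ipod" -> 2 }, "weDontSell" -> { "tie" -> 1 } }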

lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java

@@ -603,11 +603,11 @@ public final class MoreLikeThis {
    * @return a query that will return docs like the passed Readers.
    */
   public Query like(String fieldName, Reader... readers) throws IOException {
-    Map<String, Int> words = new HashMap<>();
+    Map<String, Map<String, Int>> perFieldTermFrequencies = new HashMap<>();
     for (Reader r : readers) {
-      addTermFrequencies(r, words, fieldName);
+      addTermFrequencies(r, perFieldTermFrequencies, fieldName);
     }
-    return createQuery(createQueue(words));
+    return createQuery(createQueue(perFieldTermFrequencies));
   }
 
   /**
@@ -642,58 +642,65 @@ public final class MoreLikeThis {
   /**
    * Create a PriorityQueue from a word-&gt;tf map.
    *
-   * @param words a map of words keyed on the word(String) with Int objects as the values.
+   * @param perFieldTermFrequencies a per field map of words keyed on the word(String) with Int objects as the values.
    */
-  private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
+  private PriorityQueue<ScoreTerm> createQueue(Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
     // have collected all words in doc and their freqs
     int numDocs = ir.numDocs();
-    final int limit = Math.min(maxQueryTerms, words.size());
+    final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
     FreqQ queue = new FreqQ(limit); // will order words by score
+    for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) {
+      Map<String, Int> perWordTermFrequencies = entry.getValue();
+      String fieldName = entry.getKey();
 
-    for (String word : words.keySet()) { // for every word
-      int tf = words.get(word).x; // term freq in the source doc
-      if (minTermFreq > 0 && tf < minTermFreq) {
-        continue; // filter out words that don't occur enough times in the source
-      }
+      for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
+        String word = tfEntry.getKey();
+        int tf = tfEntry.getValue().x; // term freq in the source doc
+        if (minTermFreq > 0 && tf < minTermFreq) {
+          continue; // filter out words that don't occur enough times in the source
+        }
 
-      // go through all the fields and find the largest document frequency
-      String topField = fieldNames[0];
-      int docFreq = 0;
-      for (String fieldName : fieldNames) {
-        int freq = ir.docFreq(new Term(fieldName, word));
-        topField = (freq > docFreq) ? fieldName : topField;
-        docFreq = (freq > docFreq) ? freq : docFreq;
-      }
+        int docFreq = ir.docFreq(new Term(fieldName, word));
 
-      if (minDocFreq > 0 && docFreq < minDocFreq) {
-        continue; // filter out words that don't occur in enough docs
-      }
+        if (minDocFreq > 0 && docFreq < minDocFreq) {
+          continue; // filter out words that don't occur in enough docs
+        }
 
-      if (docFreq > maxDocFreq) {
-        continue; // filter out words that occur in too many docs
-      }
+        if (docFreq > maxDocFreq) {
+          continue; // filter out words that occur in too many docs
+        }
 
-      if (docFreq == 0) {
-        continue; // index update problem?
-      }
+        if (docFreq == 0) {
+          continue; // index update problem?
+        }
 
-      float idf = similarity.idf(docFreq, numDocs);
-      float score = tf * idf;
+        float idf = similarity.idf(docFreq, numDocs);
+        float score = tf * idf;
 
-      if (queue.size() < limit) {
-        // there is still space in the queue
-        queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
-      } else {
-        ScoreTerm term = queue.top();
-        if (term.score < score) { // update the smallest in the queue in place and update the queue.
-          term.update(word, topField, score, idf, docFreq, tf);
-          queue.updateTop();
+        if (queue.size() < limit) {
+          // there is still space in the queue
+          queue.add(new ScoreTerm(word, fieldName, score, idf, docFreq, tf));
+        } else {
+          ScoreTerm term = queue.top();
+          if (term.score < score) { // update the smallest in the queue in place and update the queue.
+            term.update(word, fieldName, score, idf, docFreq, tf);
+            queue.updateTop();
+          }
         }
       }
     }
     return queue;
   }
 
+  private int getTermsCount(Map<String, Map<String, Int>> perFieldTermFrequencies) {
+    int totalTermsCount = 0;
+    Collection<Map<String, Int>> values = perFieldTermFrequencies.values();
+    for (Map<String, Int> perWordTermFrequencies : values) {
+      totalTermsCount += perWordTermFrequencies.size();
+    }
+    return totalTermsCount;
+  }
+
   /**
    * Describe the parameters that control how the "more like this" query is formed.
    */
@@ -721,7 +728,7 @@ public final class MoreLikeThis {
    * @param docNum the id of the lucene document from which to find terms
    */
   private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
-    Map<String, Int> termFreqMap = new HashMap<>();
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
     for (String fieldName : fieldNames) {
       final Fields vectors = ir.getTermVectors(docNum);
       final Terms vector;
@@ -738,43 +745,48 @@ public final class MoreLikeThis {
         for (IndexableField field : fields) {
           final String stringValue = field.stringValue();
           if (stringValue != null) {
-            addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
+            addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName);
           }
         }
       } else {
-        addTermFrequencies(termFreqMap, vector);
+        addTermFrequencies(field2termFreqMap, vector, fieldName);
       }
     }
 
-    return createQueue(termFreqMap);
+    return createQueue(field2termFreqMap);
   }
 
-  private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> fields) throws
+  private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> field2fieldValues) throws
       IOException {
-    HashMap<String,Int> termFreqMap = new HashMap<>();
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
     for (String fieldName : fieldNames) {
-      for (String field : fields.keySet()) {
-        Collection<Object> fieldValues = fields.get(field);
+      for (String field : field2fieldValues.keySet()) {
+        Collection<Object> fieldValues = field2fieldValues.get(field);
         if(fieldValues == null)
           continue;
        for(Object fieldValue:fieldValues) {
          if (fieldValue != null) {
-            addTermFrequencies(new StringReader(String.valueOf(fieldValue)), termFreqMap,
+            addTermFrequencies(new StringReader(String.valueOf(fieldValue)), field2termFreqMap,
                 fieldName);
          }
        }
       }
     }
-    return createQueue(termFreqMap);
+    return createQueue(field2termFreqMap);
   }
 
   /**
    * Adds terms and frequencies found in vector into the Map termFreqMap
    *
-   * @param termFreqMap a Map of terms and their frequencies
+   * @param field2termFreqMap a Map of terms and their frequencies per field
    * @param vector List of terms and their frequencies for a doc/field
    */
-  private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
+  private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException {
+    Map<String, Int> termFreqMap = field2termFreqMap.get(fieldName);
+    if (termFreqMap == null) {
+      termFreqMap = new HashMap<>();
+      field2termFreqMap.put(fieldName, termFreqMap);
+    }
     final TermsEnum termsEnum = vector.iterator();
     final CharsRefBuilder spare = new CharsRefBuilder();
     BytesRef text;
@@ -802,15 +814,20 @@ public final class MoreLikeThis {
    * Adds term frequencies found by tokenizing text from reader into the Map words
    *
    * @param r a source of text to be tokenized
-   * @param termFreqMap a Map of terms and their frequencies
+   * @param perFieldTermFrequencies a Map of terms and their frequencies per field
    * @param fieldName Used by analyzer for any special per-field analysis
    */
-  private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
+  private void addTermFrequencies(Reader r, Map<String, Map<String, Int>> perFieldTermFrequencies, String fieldName)
       throws IOException {
     if (analyzer == null) {
       throw new UnsupportedOperationException("To use MoreLikeThis without " +
           "term vectors, you must provide an Analyzer");
     }
+    Map<String, Int> termFreqMap = perFieldTermFrequencies.get(fieldName);
+    if (termFreqMap == null) {
+      termFreqMap = new HashMap<>();
+      perFieldTermFrequencies.put(fieldName, termFreqMap);
+    }
     try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
       int tokenCount = 0;
       // for every token
@@ -880,9 +897,9 @@ public final class MoreLikeThis {
    * @see #retrieveInterestingTerms
    */
   private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
-    Map<String, Int> words = new HashMap<>();
-    addTermFrequencies(r, words, fieldName);
-    return createQueue(words);
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
+    addTermFrequencies(r, field2termFreqMap, fieldName);
+    return createQueue(field2termFreqMap);
   }
 
   /**

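To make the control flow above easier to follow outside the diff, here is a minimal, self-contained sketch of the new per-field selection loop in plain Java, with no Lucene types. The idf formula and the "field:term" docFreq lookup are stand-ins (assumptions, not the patched API); only the loop structure mirrors the new createQueue:

    import java.util.*;

    // Sketch only: mirrors the nesting of the new createQueue(), where docFreq
    // is looked up against the term's own field instead of scanning all fields
    // for the largest document frequency.
    public class PerFieldTopTerms {

      // Stand-in for similarity.idf(docFreq, numDocs) in the patch.
      static double idf(int docFreq, int numDocs) {
        return Math.log(numDocs / (double) (docFreq + 1)) + 1.0;
      }

      // perFieldTf: field -> (term -> tf); docFreqs: hypothetical "field:term" -> docFreq.
      static List<String> topTerms(Map<String, Map<String, Integer>> perFieldTf,
                                   Map<String, Integer> docFreqs,
                                   int numDocs, int limit) {
        Map<String, Double> scores = new HashMap<>(); // "field:term" -> tf * idf
        for (Map.Entry<String, Map<String, Integer>> byField : perFieldTf.entrySet()) {
          String field = byField.getKey();
          for (Map.Entry<String, Integer> tfEntry : byField.getValue().entrySet()) { // for every word
            int docFreq = docFreqs.getOrDefault(field + ":" + tfEntry.getKey(), 0);
            if (docFreq == 0) {
              continue; // same guard as the patch
            }
            scores.put(field + ":" + tfEntry.getKey(), tfEntry.getValue() * idf(docFreq, numDocs));
          }
        }
        List<String> ranked = new ArrayList<>(scores.keySet());
        ranked.sort(Comparator.comparingDouble(scores::get).reversed()); // highest score first
        return ranked.subList(0, Math.min(limit, ranked.size()));
      }
    }

The behavioral difference sits in the docFreq lookup: each term is weighted against its own field's statistics, which is what lets the resulting query keep field/term pairs intact.
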
lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java

@@ -18,6 +18,7 @@ package org.apache.lucene.queries.mlt;
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;
@@ -40,8 +41,14 @@ import org.apache.lucene.search.QueryUtils;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
 
 public class TestMoreLikeThis extends LuceneTestCase {
+
+  private static final String SHOP_TYPE = "type";
+  private static final String FOR_SALE = "weSell";
+  private static final String NOT_FOR_SALE = "weDontSell";
+
   private Directory directory;
   private IndexReader reader;
   private IndexSearcher searcher;
@@ -246,5 +253,80 @@ public class TestMoreLikeThis extends LuceneTestCase {
     return generatedStrings;
   }
 
+  private int addShopDoc(RandomIndexWriter writer, String type, String[] weSell, String[] weDontSell) throws IOException {
+    Document doc = new Document();
+    doc.add(newTextField(SHOP_TYPE, type, Field.Store.YES));
+    for (String item : weSell) {
+      doc.add(newTextField(FOR_SALE, item, Field.Store.YES));
+    }
+    for (String item : weDontSell) {
+      doc.add(newTextField(NOT_FOR_SALE, item, Field.Store.YES));
+    }
+    writer.addDocument(doc);
+    return writer.numDocs() - 1;
+  }
+
+  @Test
+  public void testMultiFieldShouldReturnPerFieldBooleanQuery() throws Exception {
+    IndexReader reader = null;
+    Directory dir = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+    try {
+      int maxQueryTerms = 25;
+
+      String[] itShopItemForSale = new String[]{"watch", "ipod", "asrock", "imac", "macbookpro", "monitor", "keyboard", "mouse", "speakers"};
+      String[] itShopItemNotForSale = new String[]{"tie", "trousers", "shoes", "skirt", "hat"};
+      String[] clothesShopItemForSale = new String[]{"tie", "trousers", "shoes", "skirt", "hat"};
+      String[] clothesShopItemNotForSale = new String[]{"watch", "ipod", "asrock", "imac", "macbookpro", "monitor", "keyboard", "mouse", "speakers"};
+
+      // add a series of shop docs
+      RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+      for (int i = 0; i < 100; i++) {
+        addShopDoc(writer, "it", itShopItemForSale, itShopItemNotForSale);
+      }
+      for (int i = 0; i < 10; i++) {
+        addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale);
+      }
+      // the input document is a clothes shop
+      int inputDocId = addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale);
+      reader = writer.getReader();
+      writer.close();
+
+      // set up the MLT query
+      MoreLikeThis mlt = new MoreLikeThis(reader);
+      mlt.setAnalyzer(analyzer);
+      mlt.setMaxQueryTerms(maxQueryTerms);
+      mlt.setMinDocFreq(1);
+      mlt.setMinTermFreq(1);
+      mlt.setMinWordLen(1);
+      mlt.setFieldNames(new String[]{FOR_SALE, NOT_FOR_SALE});
+
+      // perform the MLT query
+      BooleanQuery query = (BooleanQuery) mlt.like(inputDocId);
+      Collection<BooleanClause> clauses = query.clauses();
+
+      Collection<BooleanClause> expectedClothesShopClauses = new ArrayList<BooleanClause>();
+      for (String itemForSale : clothesShopItemForSale) {
+        BooleanClause booleanClause = new BooleanClause(new TermQuery(new Term(FOR_SALE, itemForSale)), BooleanClause.Occur.SHOULD);
+        expectedClothesShopClauses.add(booleanClause);
+      }
+      for (String itemNotForSale : clothesShopItemNotForSale) {
+        BooleanClause booleanClause = new BooleanClause(new TermQuery(new Term(NOT_FOR_SALE, itemNotForSale)), BooleanClause.Occur.SHOULD);
+        expectedClothesShopClauses.add(booleanClause);
+      }
+
+      for (BooleanClause expectedClause : expectedClothesShopClauses) {
+        assertTrue(clauses.contains(expectedClause));
+      }
+    } finally {
+      // clean up
+      if (reader != null) {
+        reader.close();
+      }
+      dir.close();
+      analyzer.close();
+    }
+  }
+
   // TODO: add tests for the MoreLikeThisQuery
 }