From e8dac9bfdf358fff3b484ed5cd9032c1fe4bae96 Mon Sep 17 00:00:00 2001 From: Tommaso Teofili Date: Mon, 28 Mar 2016 10:06:49 +0200 Subject: [PATCH] LUCENE-6954 - keep info about relationship between fields and terms when retrieving terms in MLT --- .../lucene/queries/mlt/MoreLikeThis.java | 127 ++++++++++-------- .../lucene/queries/mlt/TestMoreLikeThis.java | 82 +++++++++++ 2 files changed, 154 insertions(+), 55 deletions(-) diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java index 161dddb7f60..ea02af3f8f4 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java @@ -603,11 +603,11 @@ public final class MoreLikeThis { * @return a query that will return docs like the passed Readers. */ public Query like(String fieldName, Reader... readers) throws IOException { - Map words = new HashMap<>(); + Map> perFieldTermFrequencies = new HashMap<>(); for (Reader r : readers) { - addTermFrequencies(r, words, fieldName); + addTermFrequencies(r, perFieldTermFrequencies, fieldName); } - return createQuery(createQueue(words)); + return createQuery(createQueue(perFieldTermFrequencies)); } /** @@ -642,58 +642,65 @@ public final class MoreLikeThis { /** * Create a PriorityQueue from a word->tf map. * - * @param words a map of words keyed on the word(String) with Int objects as the values. + * @param perFieldTermFrequencies a per field map of words keyed on the word(String) with Int objects as the values. */ - private PriorityQueue createQueue(Map words) throws IOException { + private PriorityQueue createQueue(Map> perFieldTermFrequencies) throws IOException { // have collected all words in doc and their freqs int numDocs = ir.numDocs(); - final int limit = Math.min(maxQueryTerms, words.size()); + final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies)); FreqQ queue = new FreqQ(limit); // will order words by score + for (Map.Entry> entry : perFieldTermFrequencies.entrySet()) { + Map perWordTermFrequencies = entry.getValue(); + String fieldName = entry.getKey(); - for (String word : words.keySet()) { // for every word - int tf = words.get(word).x; // term freq in the source doc - if (minTermFreq > 0 && tf < minTermFreq) { - continue; // filter out words that don't occur enough times in the source - } + for (Map.Entry tfEntry : perWordTermFrequencies.entrySet()) { // for every word + String word = tfEntry.getKey(); + int tf = tfEntry.getValue().x; // term freq in the source doc + if (minTermFreq > 0 && tf < minTermFreq) { + continue; // filter out words that don't occur enough times in the source + } - // go through all the fields and find the largest document frequency - String topField = fieldNames[0]; - int docFreq = 0; - for (String fieldName : fieldNames) { - int freq = ir.docFreq(new Term(fieldName, word)); - topField = (freq > docFreq) ? fieldName : topField; - docFreq = (freq > docFreq) ? freq : docFreq; - } + int docFreq = ir.docFreq(new Term(fieldName, word)); - if (minDocFreq > 0 && docFreq < minDocFreq) { - continue; // filter out words that don't occur in enough docs - } + if (minDocFreq > 0 && docFreq < minDocFreq) { + continue; // filter out words that don't occur in enough docs + } - if (docFreq > maxDocFreq) { - continue; // filter out words that occur in too many docs - } + if (docFreq > maxDocFreq) { + continue; // filter out words that occur in too many docs + } - if (docFreq == 0) { - continue; // index update problem? - } + if (docFreq == 0) { + continue; // index update problem? + } - float idf = similarity.idf(docFreq, numDocs); - float score = tf * idf; + float idf = similarity.idf(docFreq, numDocs); + float score = tf * idf; - if (queue.size() < limit) { - // there is still space in the queue - queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf)); - } else { - ScoreTerm term = queue.top(); - if (term.score < score) { // update the smallest in the queue in place and update the queue. - term.update(word, topField, score, idf, docFreq, tf); - queue.updateTop(); + if (queue.size() < limit) { + // there is still space in the queue + queue.add(new ScoreTerm(word, fieldName, score, idf, docFreq, tf)); + } else { + ScoreTerm term = queue.top(); + if (term.score < score) { // update the smallest in the queue in place and update the queue. + term.update(word, fieldName, score, idf, docFreq, tf); + queue.updateTop(); + } } } } return queue; } + private int getTermsCount(Map> perFieldTermFrequencies) { + int totalTermsCount = 0; + Collection> values = perFieldTermFrequencies.values(); + for (Map perWordTermFrequencies : values) { + totalTermsCount += perWordTermFrequencies.size(); + } + return totalTermsCount; + } + /** * Describe the parameters that control how the "more like this" query is formed. */ @@ -721,7 +728,7 @@ public final class MoreLikeThis { * @param docNum the id of the lucene document from which to find terms */ private PriorityQueue retrieveTerms(int docNum) throws IOException { - Map termFreqMap = new HashMap<>(); + Map> field2termFreqMap = new HashMap<>(); for (String fieldName : fieldNames) { final Fields vectors = ir.getTermVectors(docNum); final Terms vector; @@ -738,43 +745,48 @@ public final class MoreLikeThis { for (IndexableField field : fields) { final String stringValue = field.stringValue(); if (stringValue != null) { - addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName); + addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName); } } } else { - addTermFrequencies(termFreqMap, vector); + addTermFrequencies(field2termFreqMap, vector, fieldName); } } - return createQueue(termFreqMap); + return createQueue(field2termFreqMap); } - private PriorityQueue retrieveTerms(Map> fields) throws + private PriorityQueue retrieveTerms(Map> field2fieldValues) throws IOException { - HashMap termFreqMap = new HashMap<>(); + Map> field2termFreqMap = new HashMap<>(); for (String fieldName : fieldNames) { - for (String field : fields.keySet()) { - Collection fieldValues = fields.get(field); + for (String field : field2fieldValues.keySet()) { + Collection fieldValues = field2fieldValues.get(field); if(fieldValues == null) continue; for(Object fieldValue:fieldValues) { if (fieldValue != null) { - addTermFrequencies(new StringReader(String.valueOf(fieldValue)), termFreqMap, + addTermFrequencies(new StringReader(String.valueOf(fieldValue)), field2termFreqMap, fieldName); } } } } - return createQueue(termFreqMap); + return createQueue(field2termFreqMap); } /** * Adds terms and frequencies found in vector into the Map termFreqMap * - * @param termFreqMap a Map of terms and their frequencies + * @param field2termFreqMap a Map of terms and their frequencies per field * @param vector List of terms and their frequencies for a doc/field */ - private void addTermFrequencies(Map termFreqMap, Terms vector) throws IOException { + private void addTermFrequencies(Map> field2termFreqMap, Terms vector, String fieldName) throws IOException { + Map termFreqMap = field2termFreqMap.get(fieldName); + if (termFreqMap == null) { + termFreqMap = new HashMap<>(); + field2termFreqMap.put(fieldName, termFreqMap); + } final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; @@ -802,15 +814,20 @@ public final class MoreLikeThis { * Adds term frequencies found by tokenizing text from reader into the Map words * * @param r a source of text to be tokenized - * @param termFreqMap a Map of terms and their frequencies + * @param perFieldTermFrequencies a Map of terms and their frequencies per field * @param fieldName Used by analyzer for any special per-field analysis */ - private void addTermFrequencies(Reader r, Map termFreqMap, String fieldName) + private void addTermFrequencies(Reader r, Map> perFieldTermFrequencies, String fieldName) throws IOException { if (analyzer == null) { throw new UnsupportedOperationException("To use MoreLikeThis without " + "term vectors, you must provide an Analyzer"); } + Map termFreqMap = perFieldTermFrequencies.get(fieldName); + if (termFreqMap == null) { + termFreqMap = new HashMap<>(); + perFieldTermFrequencies.put(fieldName, termFreqMap); + } try (TokenStream ts = analyzer.tokenStream(fieldName, r)) { int tokenCount = 0; // for every token @@ -880,9 +897,9 @@ public final class MoreLikeThis { * @see #retrieveInterestingTerms */ private PriorityQueue retrieveTerms(Reader r, String fieldName) throws IOException { - Map words = new HashMap<>(); - addTermFrequencies(r, words, fieldName); - return createQueue(words); + Map> field2termFreqMap = new HashMap<>(); + addTermFrequencies(r, field2termFreqMap, fieldName); + return createQueue(field2termFreqMap); } /** diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java index 6eb42662c62..5e6466fc061 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java @@ -18,6 +18,7 @@ package org.apache.lucene.queries.mlt; import java.io.IOException; import java.io.StringReader; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; @@ -40,8 +41,14 @@ import org.apache.lucene.search.QueryUtils; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; +import org.junit.Test; public class TestMoreLikeThis extends LuceneTestCase { + + private static final String SHOP_TYPE = "type"; + private static final String FOR_SALE = "weSell"; + private static final String NOT_FOR_SALE = "weDontSell"; + private Directory directory; private IndexReader reader; private IndexSearcher searcher; @@ -246,5 +253,80 @@ public class TestMoreLikeThis extends LuceneTestCase { return generatedStrings; } + private int addShopDoc(RandomIndexWriter writer, String type, String[] weSell, String[] weDontSell) throws IOException { + Document doc = new Document(); + doc.add(newTextField(SHOP_TYPE, type, Field.Store.YES)); + for (String item : weSell) { + doc.add(newTextField(FOR_SALE, item, Field.Store.YES)); + } + for (String item : weDontSell) { + doc.add(newTextField(NOT_FOR_SALE, item, Field.Store.YES)); + } + writer.addDocument(doc); + return writer.numDocs() - 1; + } + + public void testMultiFieldShouldReturnPerFieldBooleanQuery() throws Exception { + IndexReader reader = null; + Directory dir = newDirectory(); + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + try { + int maxQueryTerms = 25; + + String[] itShopItemForSale = new String[]{"watch", "ipod", "asrock", "imac", "macbookpro", "monitor", "keyboard", "mouse", "speakers"}; + String[] itShopItemNotForSale = new String[]{"tie", "trousers", "shoes", "skirt", "hat"}; + + String[] clothesShopItemForSale = new String[]{"tie", "trousers", "shoes", "skirt", "hat"}; + String[] clothesShopItemNotForSale = new String[]{"watch", "ipod", "asrock", "imac", "macbookpro", "monitor", "keyboard", "mouse", "speakers"}; + + // add series of shop docs + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + for (int i = 0; i < 100; i++) { + addShopDoc(writer, "it", itShopItemForSale, itShopItemNotForSale); + } + for (int i = 0; i < 10; i++) { + addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale); + } + // Input Document is a clothes shop + int inputDocId = addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale); + reader = writer.getReader(); + writer.close(); + + // setup MLT query + MoreLikeThis mlt = new MoreLikeThis(reader); + + mlt.setAnalyzer(analyzer); + mlt.setMaxQueryTerms(maxQueryTerms); + mlt.setMinDocFreq(1); + mlt.setMinTermFreq(1); + mlt.setMinWordLen(1); + mlt.setFieldNames(new String[]{FOR_SALE, NOT_FOR_SALE}); + + // perform MLT query + BooleanQuery query = (BooleanQuery) mlt.like(inputDocId); + Collection clauses = query.clauses(); + + Collection expectedClothesShopClauses = new ArrayList(); + for (String itemForSale : clothesShopItemForSale) { + BooleanClause booleanClause = new BooleanClause(new TermQuery(new Term(FOR_SALE, itemForSale)), BooleanClause.Occur.SHOULD); + expectedClothesShopClauses.add(booleanClause); + } + for (String itemNotForSale : clothesShopItemNotForSale) { + BooleanClause booleanClause = new BooleanClause(new TermQuery(new Term(NOT_FOR_SALE, itemNotForSale)), BooleanClause.Occur.SHOULD); + expectedClothesShopClauses.add(booleanClause); + } + + for (BooleanClause expectedClause : expectedClothesShopClauses) { + assertTrue(clauses.contains(expectedClause)); + } + } finally { + // clean up + if (reader != null) { + reader.close(); + } + dir.close(); + analyzer.close(); + } + } // TODO: add tests for the MoreLikeThisQuery }