LUCENE-6954 - keep info about relationship between fields and terms when retrieving terms in MLT

Author: Tommaso Teofili
Date:   2016-03-28 10:06:49 +02:00
Parent: 6f0488f730
Commit: e8dac9bfdf

2 changed files with 154 additions and 55 deletions
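
In short, the patch stops flattening all extracted terms into one map and instead keys term frequencies by field first, so the originating field travels with each term. A rough sketch of the shape change, with hypothetical field and term values borrowed from the test below:

    // before: one flat map, the source field is lost
    // Map<String, Int> words = { "ipod" -> 2, "tie" -> 1 }

    // after: frequencies grouped per field
    // Map<String, Map<String, Int>> perFieldTermFrequencies =
    //     { "weSell" -> { "ipod" -> 2 }, "weDontSell" -> { "tie" -> 1 } }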

lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java

@@ -603,11 +603,11 @@ public final class MoreLikeThis {
    * @return a query that will return docs like the passed Readers.
    */
   public Query like(String fieldName, Reader... readers) throws IOException {
-    Map<String, Int> words = new HashMap<>();
+    Map<String, Map<String, Int>> perFieldTermFrequencies = new HashMap<>();
     for (Reader r : readers) {
-      addTermFrequencies(r, words, fieldName);
+      addTermFrequencies(r, perFieldTermFrequencies, fieldName);
     }
-    return createQuery(createQueue(words));
+    return createQuery(createQueue(perFieldTermFrequencies));
   }
 
   /**
@@ -642,58 +642,65 @@ public final class MoreLikeThis {
   /**
    * Create a PriorityQueue from a word-&gt;tf map.
    *
-   * @param words a map of words keyed on the word(String) with Int objects as the values.
+   * @param perFieldTermFrequencies a per field map of words keyed on the word(String) with Int objects as the values.
    */
-  private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
+  private PriorityQueue<ScoreTerm> createQueue(Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
     // have collected all words in doc and their freqs
     int numDocs = ir.numDocs();
-    final int limit = Math.min(maxQueryTerms, words.size());
+    final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
     FreqQ queue = new FreqQ(limit); // will order words by score
+    for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) {
+      Map<String, Int> perWordTermFrequencies = entry.getValue();
+      String fieldName = entry.getKey();
 
-    for (String word : words.keySet()) { // for every word
-      int tf = words.get(word).x; // term freq in the source doc
-      if (minTermFreq > 0 && tf < minTermFreq) {
-        continue; // filter out words that don't occur enough times in the source
-      }
+      for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
+        String word = tfEntry.getKey();
+        int tf = tfEntry.getValue().x; // term freq in the source doc
+        if (minTermFreq > 0 && tf < minTermFreq) {
+          continue; // filter out words that don't occur enough times in the source
+        }
 
-      // go through all the fields and find the largest document frequency
-      String topField = fieldNames[0];
-      int docFreq = 0;
-      for (String fieldName : fieldNames) {
-        int freq = ir.docFreq(new Term(fieldName, word));
-        topField = (freq > docFreq) ? fieldName : topField;
-        docFreq = (freq > docFreq) ? freq : docFreq;
-      }
+        int docFreq = ir.docFreq(new Term(fieldName, word));
 
-      if (minDocFreq > 0 && docFreq < minDocFreq) {
-        continue; // filter out words that don't occur in enough docs
-      }
+        if (minDocFreq > 0 && docFreq < minDocFreq) {
+          continue; // filter out words that don't occur in enough docs
+        }
 
-      if (docFreq > maxDocFreq) {
-        continue; // filter out words that occur in too many docs
-      }
+        if (docFreq > maxDocFreq) {
+          continue; // filter out words that occur in too many docs
+        }
 
-      if (docFreq == 0) {
-        continue; // index update problem?
-      }
+        if (docFreq == 0) {
+          continue; // index update problem?
+        }
 
-      float idf = similarity.idf(docFreq, numDocs);
-      float score = tf * idf;
+        float idf = similarity.idf(docFreq, numDocs);
+        float score = tf * idf;
 
-      if (queue.size() < limit) {
-        // there is still space in the queue
-        queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
-      } else {
-        ScoreTerm term = queue.top();
-        if (term.score < score) { // update the smallest in the queue in place and update the queue.
-          term.update(word, topField, score, idf, docFreq, tf);
-          queue.updateTop();
+        if (queue.size() < limit) {
+          // there is still space in the queue
+          queue.add(new ScoreTerm(word, fieldName, score, idf, docFreq, tf));
+        } else {
+          ScoreTerm term = queue.top();
+          if (term.score < score) { // update the smallest in the queue in place and update the queue.
+            term.update(word, fieldName, score, idf, docFreq, tf);
+            queue.updateTop();
+          }
         }
       }
     }
     return queue;
   }
 
+  private int getTermsCount(Map<String, Map<String, Int>> perFieldTermFrequencies) {
+    int totalTermsCount = 0;
+    Collection<Map<String, Int>> values = perFieldTermFrequencies.values();
+    for (Map<String, Int> perWordTermFrequencies : values) {
+      totalTermsCount += perWordTermFrequencies.size();
+    }
+    return totalTermsCount;
+  }
+
   /**
    * Describe the parameters that control how the "more like this" query is formed.
    */
@@ -721,7 +728,7 @@ public final class MoreLikeThis {
    * @param docNum the id of the lucene document from which to find terms
    */
   private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
-    Map<String, Int> termFreqMap = new HashMap<>();
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
     for (String fieldName : fieldNames) {
       final Fields vectors = ir.getTermVectors(docNum);
       final Terms vector;
@@ -738,43 +745,48 @@ public final class MoreLikeThis {
         for (IndexableField field : fields) {
           final String stringValue = field.stringValue();
           if (stringValue != null) {
-            addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
+            addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName);
           }
         }
       } else {
-        addTermFrequencies(termFreqMap, vector);
+        addTermFrequencies(field2termFreqMap, vector, fieldName);
       }
     }
 
-    return createQueue(termFreqMap);
+    return createQueue(field2termFreqMap);
   }
 
-  private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> fields) throws
+  private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> field2fieldValues) throws
       IOException {
-    HashMap<String,Int> termFreqMap = new HashMap<>();
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
     for (String fieldName : fieldNames) {
-      for (String field : fields.keySet()) {
-        Collection<Object> fieldValues = fields.get(field);
+      for (String field : field2fieldValues.keySet()) {
+        Collection<Object> fieldValues = field2fieldValues.get(field);
         if(fieldValues == null)
           continue;
        for(Object fieldValue:fieldValues) {
          if (fieldValue != null) {
-            addTermFrequencies(new StringReader(String.valueOf(fieldValue)), termFreqMap,
+            addTermFrequencies(new StringReader(String.valueOf(fieldValue)), field2termFreqMap,
                 fieldName);
          }
        }
       }
     }
-    return createQueue(termFreqMap);
+    return createQueue(field2termFreqMap);
   }
 
   /**
    * Adds terms and frequencies found in vector into the Map termFreqMap
    *
-   * @param termFreqMap a Map of terms and their frequencies
+   * @param field2termFreqMap a Map of terms and their frequencies per field
    * @param vector List of terms and their frequencies for a doc/field
    */
-  private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
+  private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException {
+    Map<String, Int> termFreqMap = field2termFreqMap.get(fieldName);
+    if (termFreqMap == null) {
+      termFreqMap = new HashMap<>();
+      field2termFreqMap.put(fieldName, termFreqMap);
+    }
     final TermsEnum termsEnum = vector.iterator();
     final CharsRefBuilder spare = new CharsRefBuilder();
     BytesRef text;
@@ -802,15 +814,20 @@ public final class MoreLikeThis {
    * Adds term frequencies found by tokenizing text from reader into the Map words
    *
    * @param r a source of text to be tokenized
-   * @param termFreqMap a Map of terms and their frequencies
+   * @param perFieldTermFrequencies a Map of terms and their frequencies per field
    * @param fieldName Used by analyzer for any special per-field analysis
    */
-  private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
+  private void addTermFrequencies(Reader r, Map<String, Map<String, Int>> perFieldTermFrequencies, String fieldName)
       throws IOException {
     if (analyzer == null) {
       throw new UnsupportedOperationException("To use MoreLikeThis without " +
           "term vectors, you must provide an Analyzer");
     }
+    Map<String, Int> termFreqMap = perFieldTermFrequencies.get(fieldName);
+    if (termFreqMap == null) {
+      termFreqMap = new HashMap<>();
+      perFieldTermFrequencies.put(fieldName, termFreqMap);
+    }
     try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
       int tokenCount = 0;
       // for every token
@@ -880,9 +897,9 @@ public final class MoreLikeThis {
    * @see #retrieveInterestingTerms
    */
   private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
-    Map<String, Int> words = new HashMap<>();
-    addTermFrequencies(r, words, fieldName);
-    return createQueue(words);
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
+    addTermFrequencies(r, field2termFreqMap, fieldName);
+    return createQueue(field2termFreqMap);
   }
 
   /**

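To make the control flow above easier to follow outside the diff, here is a minimal, self-contained sketch of the new per-field selection loop in plain Java, with no Lucene types. The idf formula and the "field:term" docFreq lookup are stand-ins (assumptions, not the patched API); only the loop structure mirrors the new createQueue:

    import java.util.*;

    // Sketch only: mirrors the nesting of the new createQueue(), where docFreq
    // is looked up against the term's own field instead of scanning all fields
    // for the largest document frequency.
    public class PerFieldTopTerms {

      // Stand-in for similarity.idf(docFreq, numDocs) in the patch.
      static double idf(int docFreq, int numDocs) {
        return Math.log(numDocs / (double) (docFreq + 1)) + 1.0;
      }

      // perFieldTf: field -> (term -> tf); docFreqs: hypothetical "field:term" -> docFreq.
      static List<String> topTerms(Map<String, Map<String, Integer>> perFieldTf,
                                   Map<String, Integer> docFreqs,
                                   int numDocs, int limit) {
        Map<String, Double> scores = new HashMap<>(); // "field:term" -> tf * idf
        for (Map.Entry<String, Map<String, Integer>> byField : perFieldTf.entrySet()) {
          String field = byField.getKey();
          for (Map.Entry<String, Integer> tfEntry : byField.getValue().entrySet()) { // for every word
            int docFreq = docFreqs.getOrDefault(field + ":" + tfEntry.getKey(), 0);
            if (docFreq == 0) {
              continue; // same guard as the patch
            }
            scores.put(field + ":" + tfEntry.getKey(), tfEntry.getValue() * idf(docFreq, numDocs));
          }
        }
        List<String> ranked = new ArrayList<>(scores.keySet());
        ranked.sort(Comparator.comparingDouble(scores::get).reversed()); // highest score first
        return ranked.subList(0, Math.min(limit, ranked.size()));
      }
    }

The behavioral difference sits in the docFreq lookup: each term is weighted against its own field's statistics, which is what lets the resulting query keep field/term pairs intact.
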
lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java

@@ -18,6 +18,7 @@ package org.apache.lucene.queries.mlt;
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;
@@ -40,8 +41,14 @@ import org.apache.lucene.search.QueryUtils;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;
 
 public class TestMoreLikeThis extends LuceneTestCase {
+
+  private static final String SHOP_TYPE = "type";
+  private static final String FOR_SALE = "weSell";
+  private static final String NOT_FOR_SALE = "weDontSell";
+
   private Directory directory;
   private IndexReader reader;
   private IndexSearcher searcher;
@@ -246,5 +253,80 @@ public class TestMoreLikeThis extends LuceneTestCase {
     return generatedStrings;
   }
 
+  private int addShopDoc(RandomIndexWriter writer, String type, String[] weSell, String[] weDontSell) throws IOException {
+    Document doc = new Document();
+    doc.add(newTextField(SHOP_TYPE, type, Field.Store.YES));
+    for (String item : weSell) {
+      doc.add(newTextField(FOR_SALE, item, Field.Store.YES));
+    }
+    for (String item : weDontSell) {
+      doc.add(newTextField(NOT_FOR_SALE, item, Field.Store.YES));
+    }
+    writer.addDocument(doc);
+    return writer.numDocs() - 1;
+  }
+
+  @Test
+  public void testMultiFieldShouldReturnPerFieldBooleanQuery() throws Exception {
+    IndexReader reader = null;
+    Directory dir = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+    try {
+      int maxQueryTerms = 25;
+
+      String[] itShopItemForSale = new String[]{"watch", "ipod", "asrock", "imac", "macbookpro", "monitor", "keyboard", "mouse", "speakers"};
+      String[] itShopItemNotForSale = new String[]{"tie", "trousers", "shoes", "skirt", "hat"};
+      String[] clothesShopItemForSale = new String[]{"tie", "trousers", "shoes", "skirt", "hat"};
+      String[] clothesShopItemNotForSale = new String[]{"watch", "ipod", "asrock", "imac", "macbookpro", "monitor", "keyboard", "mouse", "speakers"};
+
+      // add a series of shop docs
+      RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+      for (int i = 0; i < 100; i++) {
+        addShopDoc(writer, "it", itShopItemForSale, itShopItemNotForSale);
+      }
+      for (int i = 0; i < 10; i++) {
+        addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale);
+      }
+      // the input document is a clothes shop
+      int inputDocId = addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale);
+      reader = writer.getReader();
+      writer.close();
+
+      // set up the MLT query
+      MoreLikeThis mlt = new MoreLikeThis(reader);
+      mlt.setAnalyzer(analyzer);
+      mlt.setMaxQueryTerms(maxQueryTerms);
+      mlt.setMinDocFreq(1);
+      mlt.setMinTermFreq(1);
+      mlt.setMinWordLen(1);
+      mlt.setFieldNames(new String[]{FOR_SALE, NOT_FOR_SALE});
+
+      // perform the MLT query
+      BooleanQuery query = (BooleanQuery) mlt.like(inputDocId);
+      Collection<BooleanClause> clauses = query.clauses();
+
+      Collection<BooleanClause> expectedClothesShopClauses = new ArrayList<BooleanClause>();
+      for (String itemForSale : clothesShopItemForSale) {
+        BooleanClause booleanClause = new BooleanClause(new TermQuery(new Term(FOR_SALE, itemForSale)), BooleanClause.Occur.SHOULD);
+        expectedClothesShopClauses.add(booleanClause);
+      }
+      for (String itemNotForSale : clothesShopItemNotForSale) {
+        BooleanClause booleanClause = new BooleanClause(new TermQuery(new Term(NOT_FOR_SALE, itemNotForSale)), BooleanClause.Occur.SHOULD);
+        expectedClothesShopClauses.add(booleanClause);
+      }
+
+      for (BooleanClause expectedClause : expectedClothesShopClauses) {
+        assertTrue(clauses.contains(expectedClause));
+      }
+    } finally {
+      // clean up
+      if (reader != null) {
+        reader.close();
+      }
+      dir.close();
+      analyzer.close();
+    }
+  }
+
   // TODO: add tests for the MoreLikeThisQuery
 }