mirror of https://github.com/apache/lucene.git

LUCENE-6954 - keep info about relationship between fields and terms when retrieving terms in MLT

parent 6f0488f730
commit e8dac9bfdf
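What the change does: MoreLikeThis previously collected all source-document terms into a single flat Map<String, Int>, which lost track of the field each term came from. This commit threads a nested per-field map through the whole retrieval path, so every ScoreTerm can be attributed to its originating field. A minimal sketch of the new shape (hypothetical field/term values; java.lang.Integer stands in for MLT's internal Int box):

    // Before: term -> frequency; the source field is lost.
    Map<String, Integer> words = new HashMap<>();
    words.put("ipod", 3);

    // After: field -> (term -> frequency); the field survives retrieval.
    Map<String, Map<String, Integer>> perFieldTermFrequencies = new HashMap<>();
    perFieldTermFrequencies
        .computeIfAbsent("weSell", k -> new HashMap<>())
        .merge("ipod", 1, Integer::sum);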
org/apache/lucene/queries/mlt/MoreLikeThis.java
@@ -603,11 +603,11 @@ public final class MoreLikeThis {
    * @return a query that will return docs like the passed Readers.
    */
   public Query like(String fieldName, Reader... readers) throws IOException {
-    Map<String, Int> words = new HashMap<>();
+    Map<String, Map<String, Int>> perFieldTermFrequencies = new HashMap<>();
     for (Reader r : readers) {
-      addTermFrequencies(r, words, fieldName);
+      addTermFrequencies(r, perFieldTermFrequencies, fieldName);
     }
-    return createQuery(createQueue(words));
+    return createQuery(createQueue(perFieldTermFrequencies));
   }

   /**
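For context, a hedged usage sketch of the like(String, Reader...) entry point changed above. It assumes an already-open IndexReader `reader`, a matching Analyzer `analyzer`, and a hypothetical "body" field; none of this is part of the commit:

    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(analyzer);                 // required when passing Readers
    mlt.setFieldNames(new String[]{"body"});
    Query like = mlt.like("body", new StringReader("free text to find similar docs for"));
    TopDocs hits = new IndexSearcher(reader).search(like, 10);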
@@ -642,58 +642,65 @@ public final class MoreLikeThis {
   /**
    * Create a PriorityQueue from a word->tf map.
    *
-   * @param words a map of words keyed on the word(String) with Int objects as the values.
+   * @param perFieldTermFrequencies a per field map of words keyed on the word(String) with Int objects as the values.
    */
-  private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
+  private PriorityQueue<ScoreTerm> createQueue(Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
     // have collected all words in doc and their freqs
     int numDocs = ir.numDocs();
-    final int limit = Math.min(maxQueryTerms, words.size());
+    final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
     FreqQ queue = new FreqQ(limit); // will order words by score
+    for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) {
+      Map<String, Int> perWordTermFrequencies = entry.getValue();
+      String fieldName = entry.getKey();

-    for (String word : words.keySet()) { // for every word
-      int tf = words.get(word).x; // term freq in the source doc
-      if (minTermFreq > 0 && tf < minTermFreq) {
-        continue; // filter out words that don't occur enough times in the source
-      }
+      for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
+        String word = tfEntry.getKey();
+        int tf = tfEntry.getValue().x; // term freq in the source doc
+        if (minTermFreq > 0 && tf < minTermFreq) {
+          continue; // filter out words that don't occur enough times in the source
+        }

-      // go through all the fields and find the largest document frequency
-      String topField = fieldNames[0];
-      int docFreq = 0;
-      for (String fieldName : fieldNames) {
-        int freq = ir.docFreq(new Term(fieldName, word));
-        topField = (freq > docFreq) ? fieldName : topField;
-        docFreq = (freq > docFreq) ? freq : docFreq;
-      }
+        int docFreq = ir.docFreq(new Term(fieldName, word));

-      if (minDocFreq > 0 && docFreq < minDocFreq) {
-        continue; // filter out words that don't occur in enough docs
-      }
+        if (minDocFreq > 0 && docFreq < minDocFreq) {
+          continue; // filter out words that don't occur in enough docs
+        }

-      if (docFreq > maxDocFreq) {
-        continue; // filter out words that occur in too many docs
-      }
+        if (docFreq > maxDocFreq) {
+          continue; // filter out words that occur in too many docs
+        }

-      if (docFreq == 0) {
-        continue; // index update problem?
-      }
+        if (docFreq == 0) {
+          continue; // index update problem?
+        }

-      float idf = similarity.idf(docFreq, numDocs);
-      float score = tf * idf;
+        float idf = similarity.idf(docFreq, numDocs);
+        float score = tf * idf;

-      if (queue.size() < limit) {
-        // there is still space in the queue
-        queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf));
-      } else {
-        ScoreTerm term = queue.top();
-        if (term.score < score) { // update the smallest in the queue in place and update the queue.
-          term.update(word, topField, score, idf, docFreq, tf);
-          queue.updateTop();
+        if (queue.size() < limit) {
+          // there is still space in the queue
+          queue.add(new ScoreTerm(word, fieldName, score, idf, docFreq, tf));
+        } else {
+          ScoreTerm term = queue.top();
+          if (term.score < score) { // update the smallest in the queue in place and update the queue.
+            term.update(word, fieldName, score, idf, docFreq, tf);
+            queue.updateTop();
+          }
         }
       }
     }
     return queue;
   }

+  private int getTermsCount(Map<String, Map<String, Int>> perFieldTermFrequencies) {
+    int totalTermsCount = 0;
+    Collection<Map<String, Int>> values = perFieldTermFrequencies.values();
+    for (Map<String, Int> perWordTermFrequencies : values) {
+      totalTermsCount += perWordTermFrequencies.size();
+    }
+    return totalTermsCount;
+  }
+
   /**
    * Describe the parameters that control how the "more like this" query is formed.
    */
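Two behavioral consequences are visible in this hunk. First, the candidate-term limit now counts distinct terms across all fields (getTermsCount sums the inner map sizes) instead of the size of one flat map. Second, docFreq is now taken from the field the term actually came from (the entry key), replacing the old loop that scanned every field in fieldNames for the largest document frequency. A tiny illustration of the new limit computation (hypothetical values; Java 9's Map.of used only for brevity):

    Map<String, Map<String, Integer>> perField = new HashMap<>();
    perField.put("weSell", new HashMap<>(Map.of("ipod", 1, "watch", 2)));  // 2 distinct terms
    perField.put("weDontSell", new HashMap<>(Map.of("tie", 1)));           // 1 distinct term
    int termsCount = perField.values().stream().mapToInt(Map::size).sum(); // 3
    int limit = Math.min(25 /* maxQueryTerms */, termsCount);              // limit = 3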
@@ -721,7 +728,7 @@ public final class MoreLikeThis {
    * @param docNum the id of the lucene document from which to find terms
    */
   private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
-    Map<String, Int> termFreqMap = new HashMap<>();
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
     for (String fieldName : fieldNames) {
       final Fields vectors = ir.getTermVectors(docNum);
       final Terms vector;
@@ -738,43 +745,48 @@ public final class MoreLikeThis {
         for (IndexableField field : fields) {
           final String stringValue = field.stringValue();
           if (stringValue != null) {
-            addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
+            addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName);
           }
         }
       } else {
-        addTermFrequencies(termFreqMap, vector);
+        addTermFrequencies(field2termFreqMap, vector, fieldName);
       }
     }

-    return createQueue(termFreqMap);
+    return createQueue(field2termFreqMap);
   }


-  private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> fields) throws
+  private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> field2fieldValues) throws
       IOException {
-    HashMap<String,Int> termFreqMap = new HashMap<>();
+    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
     for (String fieldName : fieldNames) {
-      for (String field : fields.keySet()) {
-        Collection<Object> fieldValues = fields.get(field);
+      for (String field : field2fieldValues.keySet()) {
+        Collection<Object> fieldValues = field2fieldValues.get(field);
         if (fieldValues == null)
           continue;
         for (Object fieldValue : fieldValues) {
           if (fieldValue != null) {
-            addTermFrequencies(new StringReader(String.valueOf(fieldValue)), termFreqMap,
+            addTermFrequencies(new StringReader(String.valueOf(fieldValue)), field2termFreqMap,
                 fieldName);
           }
         }
       }
     }
-    return createQueue(termFreqMap);
+    return createQueue(field2termFreqMap);
   }

   /**
    * Adds terms and frequencies found in vector into the Map termFreqMap
    *
-   * @param termFreqMap a Map of terms and their frequencies
+   * @param field2termFreqMap a Map of terms and their frequencies per field
    * @param vector List of terms and their frequencies for a doc/field
    */
-  private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
+  private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException {
+    Map<String, Int> termFreqMap = field2termFreqMap.get(fieldName);
+    if (termFreqMap == null) {
+      termFreqMap = new HashMap<>();
+      field2termFreqMap.put(fieldName, termFreqMap);
+    }
     final TermsEnum termsEnum = vector.iterator();
     final CharsRefBuilder spare = new CharsRefBuilder();
     BytesRef text;
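Both new addTermFrequencies overloads materialize the inner per-field map with the same null-check-then-put sequence. A stylistic aside, not what the patch does: on Java 8+ the same lookup-or-create can be written in one expression:

    // Equivalent to the get / null-check / put in the patch:
    Map<String, Int> termFreqMap =
        field2termFreqMap.computeIfAbsent(fieldName, k -> new HashMap<>());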
@@ -802,15 +814,20 @@ public final class MoreLikeThis {
    * Adds term frequencies found by tokenizing text from reader into the Map words
    *
    * @param r a source of text to be tokenized
-   * @param termFreqMap a Map of terms and their frequencies
+   * @param perFieldTermFrequencies a Map of terms and their frequencies per field
    * @param fieldName Used by analyzer for any special per-field analysis
    */
-  private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
+  private void addTermFrequencies(Reader r, Map<String, Map<String, Int>> perFieldTermFrequencies, String fieldName)
       throws IOException {
     if (analyzer == null) {
       throw new UnsupportedOperationException("To use MoreLikeThis without " +
           "term vectors, you must provide an Analyzer");
     }
+    Map<String, Int> termFreqMap = perFieldTermFrequencies.get(fieldName);
+    if (termFreqMap == null) {
+      termFreqMap = new HashMap<>();
+      perFieldTermFrequencies.put(fieldName, termFreqMap);
+    }
     try (TokenStream ts = analyzer.tokenStream(fieldName, r)) {
       int tokenCount = 0;
       // for every token
|
|||
* @see #retrieveInterestingTerms
|
||||
*/
|
||||
private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
|
||||
Map<String, Int> words = new HashMap<>();
|
||||
addTermFrequencies(r, words, fieldName);
|
||||
return createQueue(words);
|
||||
Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
|
||||
addTermFrequencies(r, field2termFreqMap, fieldName);
|
||||
return createQueue(field2termFreqMap);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
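retrieveTerms(Reader, String) backs the public retrieveInterestingTerms helper. A hedged usage sketch, assuming `mlt` is configured with an Analyzer as in the earlier example and that "weSell" is a field in the index:

    String[] interesting =
        mlt.retrieveInterestingTerms(new StringReader("watch ipod macbookpro"), "weSell");
    // the most interesting terms for the text, capped at maxQueryTerms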
org/apache/lucene/queries/mlt/TestMoreLikeThis.java
@@ -18,6 +18,7 @@ package org.apache.lucene.queries.mlt;

 import java.io.IOException;
 import java.io.StringReader;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;

@@ -40,8 +41,14 @@ import org.apache.lucene.search.QueryUtils;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.LuceneTestCase;
+import org.junit.Test;

 public class TestMoreLikeThis extends LuceneTestCase {
+
+  private static final String SHOP_TYPE = "type";
+  private static final String FOR_SALE = "weSell";
+  private static final String NOT_FOR_SALE = "weDontSell";
+
   private Directory directory;
   private IndexReader reader;
   private IndexSearcher searcher;
@@ -246,5 +253,80 @@ public class TestMoreLikeThis extends LuceneTestCase {
     return generatedStrings;
   }

+  private int addShopDoc(RandomIndexWriter writer, String type, String[] weSell, String[] weDontSell) throws IOException {
+    Document doc = new Document();
+    doc.add(newTextField(SHOP_TYPE, type, Field.Store.YES));
+    for (String item : weSell) {
+      doc.add(newTextField(FOR_SALE, item, Field.Store.YES));
+    }
+    for (String item : weDontSell) {
+      doc.add(newTextField(NOT_FOR_SALE, item, Field.Store.YES));
+    }
+    writer.addDocument(doc);
+    return writer.numDocs() - 1;
+  }
+
+  public void testMultiFieldShouldReturnPerFieldBooleanQuery() throws Exception {
+    IndexReader reader = null;
+    Directory dir = newDirectory();
+    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
+    try {
+      int maxQueryTerms = 25;
+
+      String[] itShopItemForSale = new String[]{"watch", "ipod", "asrock", "imac", "macbookpro", "monitor", "keyboard", "mouse", "speakers"};
+      String[] itShopItemNotForSale = new String[]{"tie", "trousers", "shoes", "skirt", "hat"};
+
+      String[] clothesShopItemForSale = new String[]{"tie", "trousers", "shoes", "skirt", "hat"};
+      String[] clothesShopItemNotForSale = new String[]{"watch", "ipod", "asrock", "imac", "macbookpro", "monitor", "keyboard", "mouse", "speakers"};
+
+      // add series of shop docs
+      RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
+      for (int i = 0; i < 100; i++) {
+        addShopDoc(writer, "it", itShopItemForSale, itShopItemNotForSale);
+      }
+      for (int i = 0; i < 10; i++) {
+        addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale);
+      }
+      // Input Document is a clothes shop
+      int inputDocId = addShopDoc(writer, "clothes", clothesShopItemForSale, clothesShopItemNotForSale);
+      reader = writer.getReader();
+      writer.close();
+
+      // setup MLT query
+      MoreLikeThis mlt = new MoreLikeThis(reader);
+
+      mlt.setAnalyzer(analyzer);
+      mlt.setMaxQueryTerms(maxQueryTerms);
+      mlt.setMinDocFreq(1);
+      mlt.setMinTermFreq(1);
+      mlt.setMinWordLen(1);
+      mlt.setFieldNames(new String[]{FOR_SALE, NOT_FOR_SALE});
+
+      // perform MLT query
+      BooleanQuery query = (BooleanQuery) mlt.like(inputDocId);
+      Collection<BooleanClause> clauses = query.clauses();
+
+      Collection<BooleanClause> expectedClothesShopClauses = new ArrayList<BooleanClause>();
+      for (String itemForSale : clothesShopItemForSale) {
+        BooleanClause booleanClause = new BooleanClause(new TermQuery(new Term(FOR_SALE, itemForSale)), BooleanClause.Occur.SHOULD);
+        expectedClothesShopClauses.add(booleanClause);
+      }
+      for (String itemNotForSale : clothesShopItemNotForSale) {
+        BooleanClause booleanClause = new BooleanClause(new TermQuery(new Term(NOT_FOR_SALE, itemNotForSale)), BooleanClause.Occur.SHOULD);
+        expectedClothesShopClauses.add(booleanClause);
+      }
+
+      for (BooleanClause expectedClause : expectedClothesShopClauses) {
+        assertTrue(clauses.contains(expectedClause));
+      }
+    } finally {
+      // clean up
+      if (reader != null) {
+        reader.close();
+      }
+      dir.close();
+      analyzer.close();
+    }
+  }
   // TODO: add tests for the MoreLikeThisQuery
 }