LUCENE-5725: MoreLikeThis#like now accepts multiple values per field

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1599442 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2014-06-03 07:51:55 +00:00
parent bb9116c485
commit 2eed3f94d3
6 changed files with 59 additions and 21 deletions

View File

@ -138,6 +138,10 @@ Changes in Backwards Compatibility Policy
API Changes
* LUCENE-5725: MoreLikeThis#like now accepts multiple values per field.
The pre-existing method has been deprecated in favor of a variable-arguments
overload for the like text. (Alex Ksikes via Simon Willnauer)
* LUCENE-5711: MergePolicy accepts an IndexWriter instance
on each method rather than holding state against a single
IndexWriter instance. (Simon Willnauer)

View File

@ -84,7 +84,7 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
}
BooleanQuery mltQuery = new BooleanQuery();
for (String textFieldName : textFieldNames) {
mltQuery.add(new BooleanClause(mlt.like(new StringReader(text), textFieldName), BooleanClause.Occur.SHOULD));
mltQuery.add(new BooleanClause(mlt.like(textFieldName, new StringReader(text)), BooleanClause.Occur.SHOULD));
}
Query classFieldQuery = new WildcardQuery(new Term(classFieldName, "*"));
mltQuery.add(new BooleanClause(classFieldQuery, BooleanClause.Occur.MUST));

View File

@ -15,23 +15,21 @@
*/
package org.apache.lucene.queries.mlt;
import java.io.*;
import java.util.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.StorableField;
import org.apache.lucene.index.StoredDocument;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
@ -39,6 +37,15 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.UnicodeUtil;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
/**
* Generate "more like this" similarity queries.
@ -581,12 +588,17 @@ public final class MoreLikeThis {
}
/**
* Return a query that will return docs like the passed Reader.
* Return a query that will return docs like the passed Readers.
* This was added in order to treat multi-value fields.
*
* @return a query that will return docs like the passed Reader.
* @return a query that will return docs like the passed Readers.
*/
public Query like(Reader r, String fieldName) throws IOException {
return createQuery(retrieveTerms(r, fieldName));
public Query like(String fieldName, Reader... readers) throws IOException {
Map<String, Int> words = new HashMap<>();
for (Reader r : readers) {
addTermFrequencies(r, words, fieldName);
}
return createQuery(createQueue(words));
}
/**

View File

@ -26,7 +26,6 @@ import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
@ -71,7 +70,7 @@ public class MoreLikeThisQuery extends Query {
}
mlt.setMaxQueryTerms(maxQueryTerms);
mlt.setStopWords(stopWords);
BooleanQuery bq = (BooleanQuery) mlt.like(new StringReader(likeText), fieldName);
BooleanQuery bq = (BooleanQuery) mlt.like(fieldName, new StringReader(likeText));
BooleanClause[] clauses = bq.getClauses();
//make at least half the terms match
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));

View File

@ -19,17 +19,18 @@ package org.apache.lucene.queries.mlt;
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
@ -53,6 +54,8 @@ public class TestMoreLikeThis extends LuceneTestCase {
// Add series of docs with specific information for MoreLikeThis
addDoc(writer, "lucene");
addDoc(writer, "lucene release");
addDoc(writer, "apache");
addDoc(writer, "apache lucene");
reader = writer.getReader();
writer.shutdown();
@ -88,8 +91,8 @@ public class TestMoreLikeThis extends LuceneTestCase {
float boostFactor = 5;
mlt.setBoostFactor(boostFactor);
BooleanQuery query = (BooleanQuery) mlt.like(new StringReader(
"lucene release"), "text");
BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(
"lucene release"));
List<BooleanClause> clauses = query.clauses();
assertEquals("Expected " + originalValues.size() + " clauses.",
@ -116,8 +119,8 @@ public class TestMoreLikeThis extends LuceneTestCase {
mlt.setMinWordLen(1);
mlt.setFieldNames(new String[] {"text"});
mlt.setBoost(true);
BooleanQuery query = (BooleanQuery) mlt.like(new StringReader(
"lucene release"), "text");
BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(
"lucene release"));
List<BooleanClause> clauses = query.clauses();
for (BooleanClause clause : clauses) {
@ -135,9 +138,29 @@ public class TestMoreLikeThis extends LuceneTestCase {
mlt.setMinTermFreq(1);
mlt.setMinWordLen(1);
mlt.setFieldNames(new String[] {"text", "foobar"});
mlt.like(new StringReader("this is a test"), "foobar");
mlt.like("foobar", new StringReader("this is a test"));
}
// LUCENE-5725: exercises MoreLikeThis#like(String, Reader...) with several
// values supplied for a single field in one call.
public void testMultiValues() throws Exception {
MoreLikeThis mlt = new MoreLikeThis(reader);
// KEYWORD tokenizer mode: presumably each reader's full text is emitted as one
// token (e.g. "lucene release" stays a single term) -- confirm against MockTokenizer.
mlt.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
// Lower every threshold to 1 so the thresholds themselves do not exclude terms.
mlt.setMinDocFreq(1);
mlt.setMinTermFreq(1);
mlt.setMinWordLen(1);
mlt.setFieldNames(new String[] {"text"});
// Four readers are passed as separate values of the same "text" field.
BooleanQuery query = (BooleanQuery) mlt.like("text",
new StringReader("lucene"), new StringReader("lucene release"),
new StringReader("apache"), new StringReader("apache lucene"));
List<BooleanClause> clauses = query.clauses();
// Only two of the four values are expected to survive as query clauses.
// NOTE(review): presumably the two-word values are dropped because, as single
// keyword tokens, they match no term in the index -- verify against setUp/addDoc.
assertEquals("Expected 2 clauses only!", 2, clauses.size());
// Each remaining clause must be a TermQuery on text:lucene or text:apache.
for (BooleanClause clause : clauses) {
Term term = ((TermQuery) clause.getQuery()).getTerm();
assertTrue(Arrays.asList(new Term("text", "lucene"), new Term("text", "apache")).contains(term));
}
}
// just basic equals/hashcode etc
public void testMoreLikeThisQuery() throws Exception {
Query query = new MoreLikeThisQuery("this is a test", new String[] { "text" }, new MockAnalyzer(random()), "text");

View File

@ -370,7 +370,7 @@ public class MoreLikeThisHandler extends RequestHandlerBase
public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
{
// analyzing with the first field: previous (stupid) behavior
rawMLTQuery = mlt.like(reader, mlt.getFieldNames()[0]);
rawMLTQuery = mlt.like(mlt.getFieldNames()[0], reader);
boostedMLTQuery = getBoostedQuery( rawMLTQuery );
if( terms != null ) {
fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms );