mirror of https://github.com/apache/lucene.git
LUCENE-5725: MoreLikeThis#like now accetps multiple values per field
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1599442 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
bb9116c485
commit
2eed3f94d3
|
@ -138,6 +138,10 @@ Changes in Backwards Compatibility Policy
|
|||
|
||||
API Changes
|
||||
|
||||
* LUCENE-5725: MoreLikeThis#like now accetps multiple values per field.
|
||||
The pre-existing method has been deprecated in favor of a variable arguments
|
||||
for the like text. (Alex Ksikes via Simon Willnauer)
|
||||
|
||||
* LUCENE-5711: MergePolicy accepts an IndexWriter instance
|
||||
on each method rather than holding state against a single
|
||||
IndexWriter instance. (Simon Willnauer)
|
||||
|
|
|
@ -84,7 +84,7 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
|
|||
}
|
||||
BooleanQuery mltQuery = new BooleanQuery();
|
||||
for (String textFieldName : textFieldNames) {
|
||||
mltQuery.add(new BooleanClause(mlt.like(new StringReader(text), textFieldName), BooleanClause.Occur.SHOULD));
|
||||
mltQuery.add(new BooleanClause(mlt.like(textFieldName, new StringReader(text)), BooleanClause.Occur.SHOULD));
|
||||
}
|
||||
Query classFieldQuery = new WildcardQuery(new Term(classFieldName, "*"));
|
||||
mltQuery.add(new BooleanClause(classFieldQuery, BooleanClause.Occur.MUST));
|
||||
|
|
|
@ -15,23 +15,21 @@
|
|||
*/
|
||||
package org.apache.lucene.queries.mlt;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.StorableField;
|
||||
import org.apache.lucene.index.StoredDocument;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.similarities.DefaultSimilarity;
|
||||
import org.apache.lucene.search.similarities.TFIDFSimilarity;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -39,6 +37,15 @@ import org.apache.lucene.util.CharsRef;
|
|||
import org.apache.lucene.util.PriorityQueue;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
/**
|
||||
* Generate "more like this" similarity queries.
|
||||
|
@ -581,12 +588,17 @@ public final class MoreLikeThis {
|
|||
}
|
||||
|
||||
/**
|
||||
* Return a query that will return docs like the passed Reader.
|
||||
* Return a query that will return docs like the passed Readers.
|
||||
* This was added in order to treat multi-value fields.
|
||||
*
|
||||
* @return a query that will return docs like the passed Reader.
|
||||
* @return a query that will return docs like the passed Readers.
|
||||
*/
|
||||
public Query like(Reader r, String fieldName) throws IOException {
|
||||
return createQuery(retrieveTerms(r, fieldName));
|
||||
public Query like(String fieldName, Reader... readers) throws IOException {
|
||||
Map<String, Int> words = new HashMap<>();
|
||||
for (Reader r : readers) {
|
||||
addTermFrequencies(r, words, fieldName);
|
||||
}
|
||||
return createQuery(createQueue(words));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -26,7 +26,6 @@ import org.apache.lucene.search.BooleanClause;
|
|||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
|
@ -71,7 +70,7 @@ public class MoreLikeThisQuery extends Query {
|
|||
}
|
||||
mlt.setMaxQueryTerms(maxQueryTerms);
|
||||
mlt.setStopWords(stopWords);
|
||||
BooleanQuery bq = (BooleanQuery) mlt.like(new StringReader(likeText), fieldName);
|
||||
BooleanQuery bq = (BooleanQuery) mlt.like(fieldName, new StringReader(likeText));
|
||||
BooleanClause[] clauses = bq.getClauses();
|
||||
//make at least half the terms match
|
||||
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
|
||||
|
|
|
@ -19,17 +19,18 @@ package org.apache.lucene.queries.mlt;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
|
@ -53,6 +54,8 @@ public class TestMoreLikeThis extends LuceneTestCase {
|
|||
// Add series of docs with specific information for MoreLikeThis
|
||||
addDoc(writer, "lucene");
|
||||
addDoc(writer, "lucene release");
|
||||
addDoc(writer, "apache");
|
||||
addDoc(writer, "apache lucene");
|
||||
|
||||
reader = writer.getReader();
|
||||
writer.shutdown();
|
||||
|
@ -88,8 +91,8 @@ public class TestMoreLikeThis extends LuceneTestCase {
|
|||
float boostFactor = 5;
|
||||
mlt.setBoostFactor(boostFactor);
|
||||
|
||||
BooleanQuery query = (BooleanQuery) mlt.like(new StringReader(
|
||||
"lucene release"), "text");
|
||||
BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(
|
||||
"lucene release"));
|
||||
List<BooleanClause> clauses = query.clauses();
|
||||
|
||||
assertEquals("Expected " + originalValues.size() + " clauses.",
|
||||
|
@ -116,8 +119,8 @@ public class TestMoreLikeThis extends LuceneTestCase {
|
|||
mlt.setMinWordLen(1);
|
||||
mlt.setFieldNames(new String[] {"text"});
|
||||
mlt.setBoost(true);
|
||||
BooleanQuery query = (BooleanQuery) mlt.like(new StringReader(
|
||||
"lucene release"), "text");
|
||||
BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(
|
||||
"lucene release"));
|
||||
List<BooleanClause> clauses = query.clauses();
|
||||
|
||||
for (BooleanClause clause : clauses) {
|
||||
|
@ -135,9 +138,29 @@ public class TestMoreLikeThis extends LuceneTestCase {
|
|||
mlt.setMinTermFreq(1);
|
||||
mlt.setMinWordLen(1);
|
||||
mlt.setFieldNames(new String[] {"text", "foobar"});
|
||||
mlt.like(new StringReader("this is a test"), "foobar");
|
||||
mlt.like("foobar", new StringReader("this is a test"));
|
||||
}
|
||||
|
||||
|
||||
// LUCENE-5725
|
||||
public void testMultiValues() throws Exception {
|
||||
MoreLikeThis mlt = new MoreLikeThis(reader);
|
||||
mlt.setAnalyzer(new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
|
||||
mlt.setMinDocFreq(1);
|
||||
mlt.setMinTermFreq(1);
|
||||
mlt.setMinWordLen(1);
|
||||
mlt.setFieldNames(new String[] {"text"});
|
||||
|
||||
BooleanQuery query = (BooleanQuery) mlt.like("text",
|
||||
new StringReader("lucene"), new StringReader("lucene release"),
|
||||
new StringReader("apache"), new StringReader("apache lucene"));
|
||||
List<BooleanClause> clauses = query.clauses();
|
||||
assertEquals("Expected 2 clauses only!", 2, clauses.size());
|
||||
for (BooleanClause clause : clauses) {
|
||||
Term term = ((TermQuery) clause.getQuery()).getTerm();
|
||||
assertTrue(Arrays.asList(new Term("text", "lucene"), new Term("text", "apache")).contains(term));
|
||||
}
|
||||
}
|
||||
|
||||
// just basic equals/hashcode etc
|
||||
public void testMoreLikeThisQuery() throws Exception {
|
||||
Query query = new MoreLikeThisQuery("this is a test", new String[] { "text" }, new MockAnalyzer(random()), "text");
|
||||
|
|
|
@ -370,7 +370,7 @@ public class MoreLikeThisHandler extends RequestHandlerBase
|
|||
public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
|
||||
{
|
||||
// analyzing with the first field: previous (stupid) behavior
|
||||
rawMLTQuery = mlt.like(reader, mlt.getFieldNames()[0]);
|
||||
rawMLTQuery = mlt.like(mlt.getFieldNames()[0], reader);
|
||||
boostedMLTQuery = getBoostedQuery( rawMLTQuery );
|
||||
if( terms != null ) {
|
||||
fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms );
|
||||
|
|
Loading…
Reference in New Issue