SOLR-69: Adding MoreLikeThis support

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@544018 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Ryan McKinley 2007-06-04 00:27:20 +00:00
parent 1285962e95
commit 4ccd436bf8
8 changed files with 497 additions and 2 deletions

View File

@ -29,7 +29,10 @@ Detailed Change List
--------------------
New Features
1. SOLR-69: Adding MoreLikeThisHandler to search for similar documents using
lucene contrib/queries MoreLikeThis. MoreLikeThis is also avaliable from
the StandardRequestHandler using ?mlt=true. (bdelacretaz, ryan)
Changes in runtime behavior
Optimizations

View File

@ -237,6 +237,9 @@
this field (this disables length normalization and index-time
boosting for the field, and saves some memory). Only full-text
fields or fields that need an index-time boost need norms.
termVectors: [false] set to true to store the term vector for a given field.
When using MoreLikeThis, fields used for similarity should be stored for
best performance.
-->
<field name="id" type="string" indexed="true" stored="true" required="true" />
@ -245,7 +248,7 @@
<field name="nameSort" type="string" indexed="true" stored="false"/>
<field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
<field name="manu" type="text" indexed="true" stored="true" omitNorms="true"/>
<field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true"/>
<field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" termVectors="true" />
<field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="includes" type="text" indexed="true" stored="true"/>

View File

@ -387,6 +387,12 @@
</requestHandler>
<requestHandler name="/mlt" class="solr.MoreLikeThisHandler">
<lst name="defaults">
<str name="mlt.fl">manu,cat</str>
<int name="mlt.mindf">1</int>
</lst>
</requestHandler>
<!-- Update request handler.

View File

@ -0,0 +1,2 @@
AnyObjectId[8d16f425b5e508500f25787286b14dcac908aeb1] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,347 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler;
import static org.apache.solr.request.SolrParams.DF;
import static org.apache.solr.request.SolrParams.FACET;
import static org.apache.solr.request.SolrParams.FQ;
import java.io.IOException;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similar.MoreLikeThis;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrException;
import org.apache.solr.request.SimpleFacets;
import org.apache.solr.request.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.QueryParsing;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.ContentStream;
import org.apache.solr.util.MoreLikeThisParams;
import org.apache.solr.util.NamedList;
import org.apache.solr.util.SimpleOrderedMap;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.MoreLikeThisParams.TermStyle;
/**
* Solr MoreLikeThis --
*
* Return similar documents either based on a single document or based on posted text.
*
* @since solr 1.3
*/
public class MoreLikeThisHandler extends RequestHandlerBase
{
// Pattern is thread safe -- TODO? share this with general 'fl' param
private static final Pattern splitList = Pattern.compile(",| ");
@Override
public void init(NamedList args) {
super.init(args);
}
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception
{
RequestHandlerUtils.addExperimentalFormatWarning( rsp );
SolrParams params = req.getParams();
SolrIndexSearcher searcher = req.getSearcher();
// Parse Required Params
// This will either have a single Reader or valid query
Reader reader = null;
String q = params.get( SolrParams.Q );
if( q == null || q.trim().length() <1 ) {
Iterable<ContentStream> streams = req.getContentStreams();
if( streams != null ) {
Iterator<ContentStream> iter = streams.iterator();
if( iter.hasNext() ) {
reader = iter.next().getReader();
}
if( iter.hasNext() ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
"MoreLikeThis does not support multiple ContentStreams" );
}
}
}
MoreLikeThisHelper mlt = new MoreLikeThisHelper( params, searcher );
List<Query> filters = SolrPluginUtils.parseFilterQueries(req);
// Hold on to the interesting terms if relevant
TermStyle termStyle = TermStyle.get( params.get( MoreLikeThisParams.INTERESTING_TERMS ) );
List<InterestingTerm> interesting = (termStyle == TermStyle.NONE )
? null : new ArrayList<InterestingTerm>( mlt.mlt.getMaxQueryTerms() );
// What fields do we need to return
String fl = params.get(SolrParams.FL);
int flags = 0;
if (fl != null) {
flags |= SolrPluginUtils.setReturnFields(fl, rsp);
}
int start = params.getInt( SolrParams.START, 0 );
int rows = params.getInt( SolrParams.ROWS, 10 );
DocList mltDocs = null;
// Find documents MoreLikeThis - either with a reader or a query
//--------------------------------------------------------------------------------
if( reader != null ) {
mltDocs = mlt.getMoreLikeThis( reader, start, rows, filters, interesting, flags );
}
else if( q != null ) {
// Matching options
boolean includeMatch = params.getBool( MoreLikeThisParams.MATCH_INCLUDE, true );
int matchOffset = params.getInt( MoreLikeThisParams.MATCH_OFFSET, 0 );
// Find the base match
Query query = QueryParsing.parseQuery(q, params.get(DF), params, req.getSchema());
DocList match = searcher.getDocList(query, null, null, matchOffset, 1, flags ); // only get the first one...
if( includeMatch ) {
rsp.add( "match", match );
}
// This is an iterator, but we only handle the first match
DocIterator iterator = match.iterator();
if( iterator.hasNext() ) {
// do a MoreLikeThis query for each document in results
int id = iterator.nextDoc();
mltDocs = mlt.getMoreLikeThis( id, start, rows, filters, interesting, flags );
}
}
else {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
"MoreLikeThis requires either a query (?q=) or text to find similar documents." );
}
rsp.add( "response", mltDocs );
if( interesting != null ) {
if( termStyle == TermStyle.DETAILS ) {
NamedList<Float> it = new NamedList<Float>();
for( InterestingTerm t : interesting ) {
it.add( t.term.toString(), t.boost );
}
rsp.add( "interestingTerms", it );
}
else {
List<String> it = new ArrayList<String>( interesting.size() );
for( InterestingTerm t : interesting ) {
it.add( t.term.text());
}
rsp.add( "interestingTerms", it );
}
}
// maybe facet the results
if (params.getBool(FACET,false)) {
SimpleFacets f = new SimpleFacets(searcher, mltDocs, params );
rsp.add( "facet_counts", f.getFacetCounts() );
}
// Copied from StandardRequestHandler... perhaps it should be added to doStandardDebug?
try {
NamedList<Object> dbg = SolrPluginUtils.doStandardDebug(req, q, mlt.mltquery, mltDocs );
if (null != dbg) {
if (null != filters) {
dbg.add("filter_queries",req.getParams().getParams(FQ));
List<String> fqs = new ArrayList<String>(filters.size());
for (Query fq : filters) {
fqs.add(QueryParsing.toString(fq, req.getSchema()));
}
dbg.add("parsed_filter_queries",fqs);
}
rsp.add("debug", dbg);
}
} catch (Exception e) {
SolrException.logOnce(SolrCore.log, "Exception during debug", e);
rsp.add("exception_during_debug", SolrException.toStr(e));
}
}
public static class InterestingTerm
{
public Term term;
public float boost;
public static Comparator<InterestingTerm> BOOST_ORDER = new Comparator<InterestingTerm>() {
public int compare(InterestingTerm t1, InterestingTerm t2) {
float d = t1.boost - t2.boost;
if( d == 0 ) {
return 0;
}
return (d>0)?1:-1;
}
};
}
/**
* Helper class for MoreLikeThis that can be called from other request handlers
*/
public static class MoreLikeThisHelper
{
final SolrIndexSearcher searcher;
final MoreLikeThis mlt;
final IndexReader reader;
final SchemaField uniqueKeyField;
Query mltquery; // expose this for debugging
public MoreLikeThisHelper( SolrParams params, SolrIndexSearcher searcher )
{
this.searcher = searcher;
this.reader = searcher.getReader();
this.uniqueKeyField = searcher.getSchema().getUniqueKeyField();
SolrParams required = params.required();
String[] fields = splitList.split( required.get(MoreLikeThisParams.SIMILARITY_FIELDS) );
if( fields.length < 1 ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST,
"MoreLikeThis requires at least one similarity field: "+MoreLikeThisParams.SIMILARITY_FIELDS );
}
this.mlt = new MoreLikeThis( reader ); // TODO -- after LUCENE-896, we can use , searcher.getSimilarity() );
mlt.setFieldNames(fields);
mlt.setAnalyzer( searcher.getSchema().getAnalyzer() );
// configurable params
mlt.setMinTermFreq( params.getInt(MoreLikeThisParams.MIN_TERM_FREQ, MoreLikeThis.DEFAULT_MIN_TERM_FREQ));
mlt.setMinDocFreq( params.getInt(MoreLikeThisParams.MIN_DOC_FREQ, MoreLikeThis.DEFAULT_MIN_DOC_FREQ));
mlt.setMinWordLen( params.getInt(MoreLikeThisParams.MIN_WORD_LEN, MoreLikeThis.DEFAULT_MIN_WORD_LENGTH));
mlt.setMaxWordLen( params.getInt(MoreLikeThisParams.MAX_WORD_LEN, MoreLikeThis.DEFAULT_MAX_WORD_LENGTH));
mlt.setMaxQueryTerms( params.getInt(MoreLikeThisParams.MAX_QUERY_TERMS, MoreLikeThis.DEFAULT_MAX_QUERY_TERMS));
mlt.setMaxNumTokensParsed(params.getInt(MoreLikeThisParams.MAX_NUM_TOKENS_PARSED, MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED));
mlt.setBoost( params.getBool(MoreLikeThisParams.BOOST, false ) );
}
public DocList getMoreLikeThis( int id, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
{
Document doc = reader.document(id);
mltquery = mlt.like(id);
if( terms != null ) {
fillInteristingTermsFromMLTQuery( mltquery, terms );
}
// exclude current document from results
BooleanQuery mltQuery = new BooleanQuery();
mltQuery.add(mltquery, BooleanClause.Occur.MUST);
mltQuery.add(
new TermQuery(new Term(uniqueKeyField.getName(), doc.get(uniqueKeyField.getName()))),
BooleanClause.Occur.MUST_NOT);
return searcher.getDocList(mltQuery, filters, null, start, rows, flags);
}
public DocList getMoreLikeThis( Reader reader, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
{
mltquery = mlt.like(reader);
if( terms != null ) {
fillInteristingTermsFromMLTQuery( mltquery, terms );
}
return searcher.getDocList(mltquery, filters, null, start, rows, flags);
}
public NamedList<DocList> getMoreLikeThese( DocList docs, int rows, int flags ) throws IOException
{
IndexSchema schema = searcher.getSchema();
NamedList<DocList> mlt = new SimpleOrderedMap<DocList>();
DocIterator iterator = docs.iterator();
while( iterator.hasNext() ) {
int id = iterator.nextDoc();
DocList sim = getMoreLikeThis( id, 0, rows, null, null, flags );
String name = schema.printableUniqueKey( reader.document( id ) );
mlt.add(name, sim);
}
return mlt;
}
private void fillInteristingTermsFromMLTQuery( Query query, List<InterestingTerm> terms )
{
List clauses = ((BooleanQuery)mltquery).clauses();
for( Object o : clauses ) {
TermQuery q = (TermQuery)((BooleanClause)o).getQuery();
InterestingTerm it = new InterestingTerm();
it.boost = q.getBoost();
it.term = q.getTerm();
terms.add( it );
}
// alternatively we could use
// mltquery.extractTerms( terms );
}
public MoreLikeThis getMoreLikeThis()
{
return mlt;
}
}
//////////////////////// SolrInfoMBeans methods //////////////////////
@Override
public String getVersion() {
return "$Revision$";
}
@Override
public String getDescription() {
return "Solr MoreLikeThis";
}
@Override
public String getSourceId() {
return "$Id$";
}
@Override
public String getSource() {
return "$URL$";
}
@Override
public URL[] getDocs() {
try {
return new URL[] { new URL("http://wiki.apache.org/solr/MoreLikeThis") };
}
catch( MalformedURLException ex ) { return null; }
}
}

View File

@ -24,6 +24,7 @@ import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import org.apache.solr.util.MoreLikeThisParams;
import org.apache.solr.util.StrUtils;
import org.apache.solr.util.NamedList;
import org.apache.solr.util.HighlightingUtils;
@ -31,6 +32,7 @@ import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.search.*;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrException;
import org.apache.solr.handler.MoreLikeThisHandler;
import org.apache.solr.handler.RequestHandlerBase;
import static org.apache.solr.request.SolrParams.*;
@ -131,6 +133,14 @@ public class StandardRequestHandler extends RequestHandlerBase {
if (null != facetInfo) rsp.add("facet_counts", facetInfo);
// Include "More Like This" results for *each* result
if( p.getBool( MoreLikeThisParams.MLT, false ) ) {
MoreLikeThisHandler.MoreLikeThisHelper mlt
= new MoreLikeThisHandler.MoreLikeThisHelper( p, s );
int mltcount = p.getInt( MoreLikeThisParams.DOC_COUNT, 5 );
rsp.add( "moreLikeThis", mlt.getMoreLikeThese(results.docList, mltcount, flags));
}
try {
NamedList dbg = U.doStandardDebug(req, qstr, query, results.docList);
if (null != dbg) {
@ -203,3 +213,6 @@ public class StandardRequestHandler extends RequestHandlerBase {

View File

@ -0,0 +1,50 @@
package org.apache.solr.util;
public interface MoreLikeThisParams
{
// enable more like this -- this only applies to 'StandardRequestHandler' maybe DismaxRequestHandler
public final static String MLT = "mlt";
public final static String PREFIX = "mlt.";
public final static String SIMILARITY_FIELDS = PREFIX + "fl";
public final static String MIN_TERM_FREQ = PREFIX + "mintf";
public final static String MIN_DOC_FREQ = PREFIX + "mindf";
public final static String MIN_WORD_LEN = PREFIX + "minwl";
public final static String MAX_WORD_LEN = PREFIX + "maxwl";
public final static String MAX_QUERY_TERMS = PREFIX + "maxqt";
public final static String MAX_NUM_TOKENS_PARSED = PREFIX + "maxntp";
public final static String BOOST = PREFIX + "boost"; // boost or not?
// the /mlt request handler uses 'rows'
public final static String DOC_COUNT = PREFIX + "count";
// Do you want to include the original document in the results or not
public final static String MATCH_INCLUDE = PREFIX + "match.include";
// If multiple docs are matched in the query, what offset do you want?
public final static String MATCH_OFFSET = PREFIX + "match.offset";
// Do you want to include the original document in the results or not
public final static String INTERESTING_TERMS = PREFIX + "interestingTerms"; // false,details,(list or true)
public enum TermStyle {
NONE,
LIST,
DETAILS;
public static TermStyle get( String p )
{
if( p != null ) {
p = p.toUpperCase();
if( p.equals( "DETAILS" ) ) {
return DETAILS;
}
else if( p.equals( "LIST" ) ) {
return LIST;
}
}
return NONE;
}
}
}

View File

@ -0,0 +1,71 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.MultiMapSolrParams;
import org.apache.solr.request.SolrParams;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.util.ContentStream;
import org.apache.solr.util.ContentStreamBase;
/**
* TODO -- this needs to actually test the results/query etc
*/
public class MoreLikeThisHandlerTest extends AbstractSolrTestCase {
@Override public String getSchemaFile() { return "schema.xml"; }
@Override public String getSolrConfigFile() { return "solrconfig.xml"; }
@Override public void setUp() throws Exception {
super.setUp();
lrf = h.getRequestFactory("standard", 0, 20 );
}
public void testInterface()
{
MoreLikeThisHandler mlt = new MoreLikeThisHandler();
SolrCore core = SolrCore.getSolrCore();
Map<String,String[]> params = new HashMap<String,String[]>();
MultiMapSolrParams mmparams = new MultiMapSolrParams( params );
SolrQueryRequestBase req = new SolrQueryRequestBase( core, (SolrParams)mmparams ) {};
// requires 'q' or single content stream
try {
mlt.handleRequestBody( req, new SolrQueryResponse() );
}
catch( Exception ex ) {} // expected
// requires 'q' or single content stream
try {
ArrayList<ContentStream> streams = new ArrayList<ContentStream>( 2 );
streams.add( new ContentStreamBase.StringStream( "hello" ) );
streams.add( new ContentStreamBase.StringStream( "there" ) );
req.setContentStreams( streams );
mlt.handleRequestBody( req, new SolrQueryResponse() );
}
catch( Exception ex ) {} // expected
}
}