LUCENE-3271: Moved MoreLikeThis to queries module

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1146542 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christopher John Male 2011-07-14 02:07:11 +00:00
parent 54c5bde246
commit 6b700e1183
11 changed files with 36 additions and 135 deletions

View File

@ -19,5 +19,6 @@
<orderEntry type="module" module-name="analysis-common" />
<orderEntry type="module" module-name="lucene" />
<orderEntry type="module" module-name="queryparser" />
<orderEntry type="module" module-name="queries" />
</component>
</module>

View File

@ -52,6 +52,11 @@
<artifactId>lucene-queries-contrib</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>lucene-queries</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>lucene-queryparser</artifactId>

View File

@ -207,6 +207,7 @@
</fileset>
<pathelement location="${common.dir}/../modules/analysis/build/common/lucene-analyzers-common-${version}.jar"/>
<pathelement location="${common.dir}/../modules/queryparser/build/lucene-queryparser-${version}.jar"/>
<pathelement location="${common.dir}/../modules/queries/build/lucene-queries-${version}.jar"/>
</path>
<invoke-javadoc

View File

@ -1,114 +0,0 @@
/**
* Copyright 2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similar;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
/**
* Simple similarity measures.
*
* @see MoreLikeThis
*/
public final class SimilarityQueries
{
  /**
   * Not instantiable: this class only hosts static helper methods.
   */
  private SimilarityQueries()
  {
  }

  /**
   * Simple similarity query generator.
   * Takes every unique word and forms a boolean query where all words are optional.
   * After you get this you'll use it to query your {@link IndexSearcher} for similar docs.
   * The only caveat is the first hit returned <b>should be</b> your source document - you'll
   * need to then ignore that.
   *
   * <p>
   * So, if you have a code fragment like this:
   * <br>
   * <code>
   * Query q = formSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
   * </code>
   *
   * <p>
   * The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good)'</code>.
   *
   * <p>
   * The philosophy behind this method is "two documents are similar if they share lots of words".
   * Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a
   * higher similarity score if they share more uncommon words.
   *
   * <p>
   * This method is fail-safe in that if a long 'body' is passed in and
   * {@link BooleanQuery#add BooleanQuery.add()} (used internally)
   * throws
   * {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
   * query as it is will be returned.
   *
   * @param body the body of the document you want to find similar documents to
   * @param a the analyzer to use to parse the body
   * @param field the field you want to search on, probably something like "contents" or "body"
   * @param stop optional set of stop words to ignore; may be <code>null</code>
   * @return a query with all unique words in 'body'
   * @throws IOException if the analyzer's token stream fails while tokenizing 'body'
   */
  public static Query formSimilarQuery( String body,
                                        Analyzer a,
                                        String field,
                                        Set<?> stop)
    throws IOException
  {
    BooleanQuery tmp = new BooleanQuery();
    Set<String> already = new HashSet<String>(); // ignore dups

    TokenStream ts = a.reusableTokenStream( field, new StringReader( body));
    try
    {
      CharTermAttribute termAtt = ts.addAttribute( CharTermAttribute.class);

      // The TokenStream consumer workflow requires reset() before the first incrementToken().
      ts.reset();
      while ( ts.incrementToken())
      {
        String word = termAtt.toString();

        // ignore opt stop words
        if ( stop != null && stop.contains( word))
        {
          continue;
        }

        // ignore dups
        if ( ! already.add( word))
        {
          continue;
        }

        // add to query
        TermQuery tq = new TermQuery( new Term( field, word));
        try
        {
          tmp.add( tq, BooleanClause.Occur.SHOULD);
        }
        catch( BooleanQuery.TooManyClauses too)
        {
          // fail-safe, just return what we have, not the end of the world
          break;
        }
      }

      // Signal end-of-stream so the stream can perform its end-of-stream operations.
      ts.end();
    }
    finally
    {
      // Always release the stream's resources, even if tokenizing failed.
      ts.close();
    }
    return tmp;
  }
}

View File

@ -25,35 +25,38 @@
<import file="../contrib-build.xml"/>
<contrib-uptodate name="queries-contrib" contrib-src-name="queries" property="queries.uptodate" classpath.property="queries.jar"/>
<contrib-uptodate name="queries-contrib" contrib-src-name="queries" property="queries-contrib.uptodate" classpath.property="queries-contrib.jar"/>
<property name="lucene.jar" location="${common.dir}/build/lucene-core-${version}.jar"/>
<property name="servlet.jar" location="${common.dir}/lib/servlet-api-2.4.jar"/>
<available property="servlet.jar.present" type="file" file="${servlet.jar}"/>
<module-uptodate name="queryparser" property="queryparser.uptodate" classpath.property="queryparser.jar"/>
<module-uptodate name="queries" property="queries.uptodate" classpath.property="queries.jar"/>
<path id="classpath">
<pathelement path="${queries.jar}"/>
<pathelement path="${queries-contrib.jar}"/>
<pathelement path="${queryparser.jar}"/>
<pathelement path="${queries.jar}"/>
<path refid="base.classpath"/>
</path>
<path id="web-classpath">
<pathelement path="${queries.jar}"/>
<pathelement path="${queries-contrib.jar}"/>
<pathelement path="${queryparser.jar}"/>
<pathelement path="${queries.jar}"/>
<pathelement path="${servlet.jar}"/>
<pathelement path="${build.dir}/${final.name}.jar"/>
<path refid="base.classpath"/>
</path>
<target name="compile-core" depends="build-queries,build-queryparser,common.compile-core" />
<target name="compile-core" depends="build-queries-contrib,build-queryparser,build-queries,common.compile-core" />
<target name="build-queries" unless="queries.uptodate">
<target name="build-queries-contrib" unless="queries-contrib.uptodate">
<echo>XML Parser building dependency ${queries.jar}</echo>
<ant antfile="../queries/build.xml" target="default" inheritall="false" dir="../queries">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
<property name="queries.uptodate" value="true"/>
<property name="queries-contrib.uptodate" value="true"/>
</target>
<!-- override contrib-build.xml target to also build web demo -->
@ -86,10 +89,17 @@
</target>
<target name="build-queryparser" unless="queryparser.uptodate">
<echo>Demo building dependency modules/queryparser</echo>
<echo>XML QueryParser building dependency modules/queryparser</echo>
<subant target="default">
<fileset dir="${common.dir}/../modules/queryparser" includes="build.xml"/>
</subant>
</target>
<!-- Builds modules/queries unless its jar is already up to date. The guard must be
     queries.uptodate (the property set by the module-uptodate check for "queries"),
     not the queryparser one, otherwise this dependency is skipped or rebuilt based
     on the state of a different module. -->
<target name="build-queries" unless="queries.uptodate">
<echo>XML QueryParser building dependency modules/queries</echo>
<subant target="default">
<fileset dir="${common.dir}/../modules/queries" includes="build.xml"/>
</subant>
</target>
</project>

View File

@ -11,7 +11,7 @@ import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.similar.MoreLikeThisQuery;
import org.apache.lucene.queries.mlt.MoreLikeThisQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.xmlparser.DOMUtils;
import org.apache.lucene.xmlparser.ParserException;

View File

@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similar;
package org.apache.lucene.queries.mlt;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;

View File

@ -1,7 +1,7 @@
/*
* Created on 25-Jan-2006
*/
package org.apache.lucene.search.similar;
package org.apache.lucene.queries.mlt;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more

View File

@ -1,4 +1,4 @@
package org.apache.lucene.search.similar;
package org.apache.lucene.queries.mlt;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -92,14 +92,13 @@ public class TestMoreLikeThis extends LuceneTestCase {
assertEquals("Expected " + originalValues.size() + " clauses.",
originalValues.size(), clauses.size());
for (int i = 0; i < clauses.size(); i++) {
BooleanClause clause = clauses.get(i);
for (BooleanClause clause : clauses) {
TermQuery tq = (TermQuery) clause.getQuery();
Float termBoost = originalValues.get(tq.getTerm().text());
assertNotNull("Expected term " + tq.getTerm().text(), termBoost);
float totalBoost = termBoost.floatValue() * boostFactor;
float totalBoost = termBoost * boostFactor;
assertEquals("Expected boost of " + totalBoost + " for term '"
+ tq.getTerm().text() + "' got " + tq.getBoost(), totalBoost, tq
.getBoost(), 0.0001);
@ -118,11 +117,10 @@ public class TestMoreLikeThis extends LuceneTestCase {
BooleanQuery query = (BooleanQuery) mlt.like(new StringReader(
"lucene release"));
List<BooleanClause> clauses = query.clauses();
for (int i = 0; i < clauses.size(); i++) {
BooleanClause clause = clauses.get(i);
for (BooleanClause clause : clauses) {
TermQuery tq = (TermQuery) clause.getQuery();
originalValues.put(tq.getTerm().text(), Float.valueOf(tq.getBoost()));
originalValues.put(tq.getTerm().text(), tq.getBoost());
}
return originalValues;
}

View File

@ -34,7 +34,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.search.similar.MoreLikeThis;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.FacetParams;