mirror of https://github.com/apache/lucene.git
LUCENE-3271: Moved MoreLikeThis to queries module
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1146542 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
54c5bde246
commit
6b700e1183
|
@ -19,5 +19,6 @@
|
|||
<orderEntry type="module" module-name="analysis-common" />
|
||||
<orderEntry type="module" module-name="lucene" />
|
||||
<orderEntry type="module" module-name="queryparser" />
|
||||
<orderEntry type="module" module-name="queries" />
|
||||
</component>
|
||||
</module>
|
||||
|
|
|
@ -52,6 +52,11 @@
|
|||
<artifactId>lucene-queries-contrib</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>lucene-queries</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>${project.groupId}</groupId>
|
||||
<artifactId>lucene-queryparser</artifactId>
|
||||
|
|
|
@ -207,6 +207,7 @@
|
|||
</fileset>
|
||||
<pathelement location="${common.dir}/../modules/analysis/build/common/lucene-analyzers-common-${version}.jar"/>
|
||||
<pathelement location="${common.dir}/../modules/queryparser/build/lucene-queryparser-${version}.jar"/>
|
||||
<pathelement location="${common.dir}/../modules/queries/build/lucene-queries-${version}.jar"/>
|
||||
</path>
|
||||
|
||||
<invoke-javadoc
|
||||
|
|
|
@ -1,114 +0,0 @@
|
|||
/**
|
||||
* Copyright 2004 The Apache Software Foundation.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similar;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
|
||||
/**
|
||||
* Simple similarity measures.
|
||||
*
|
||||
* @see MoreLikeThis
|
||||
*/
|
||||
public final class SimilarityQueries
|
||||
{
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private SimilarityQueries()
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple similarity query generators.
|
||||
* Takes every unique word and forms a boolean query where all words are optional.
|
||||
* After you get this you'll use to to query your {@link IndexSearcher} for similar docs.
|
||||
* The only caveat is the first hit returned <b>should be</b> your source document - you'll
|
||||
* need to then ignore that.
|
||||
*
|
||||
* <p>
|
||||
* So, if you have a code fragment like this:
|
||||
* <br>
|
||||
* <code>
|
||||
* Query q = formSimilaryQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
|
||||
* </code>
|
||||
*
|
||||
* <p>
|
||||
* The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good')</code>.
|
||||
*
|
||||
* <p>
|
||||
* The philosophy behind this method is "two documents are similar if they share lots of words".
|
||||
* Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if the share more uncommon words.
|
||||
*
|
||||
* <P>
|
||||
* This method is fail-safe in that if a long 'body' is passed in and
|
||||
* {@link BooleanQuery#add BooleanQuery.add()} (used internally)
|
||||
* throws
|
||||
* {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
|
||||
* query as it is will be returned.
|
||||
*
|
||||
* @param body the body of the document you want to find similar documents to
|
||||
* @param a the analyzer to use to parse the body
|
||||
* @param field the field you want to search on, probably something like "contents" or "body"
|
||||
* @param stop optional set of stop words to ignore
|
||||
* @return a query with all unique words in 'body'
|
||||
* @throws IOException this can't happen...
|
||||
*/
|
||||
public static Query formSimilarQuery( String body,
|
||||
Analyzer a,
|
||||
String field,
|
||||
Set<?> stop)
|
||||
throws IOException
|
||||
{
|
||||
TokenStream ts = a.reusableTokenStream( field, new StringReader( body));
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
|
||||
BooleanQuery tmp = new BooleanQuery();
|
||||
Set<String> already = new HashSet<String>(); // ignore dups
|
||||
while (ts.incrementToken()) {
|
||||
String word = termAtt.toString();
|
||||
// ignore opt stop words
|
||||
if ( stop != null &&
|
||||
stop.contains( word)) continue;
|
||||
// ignore dups
|
||||
if ( ! already.add( word)) continue;
|
||||
// add to query
|
||||
TermQuery tq = new TermQuery( new Term( field, word));
|
||||
try
|
||||
{
|
||||
tmp.add( tq, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
catch( BooleanQuery.TooManyClauses too)
|
||||
{
|
||||
// fail-safe, just return what we have, not the end of the world
|
||||
break;
|
||||
}
|
||||
}
|
||||
return tmp;
|
||||
}
|
||||
}
|
|
@ -25,35 +25,38 @@
|
|||
|
||||
<import file="../contrib-build.xml"/>
|
||||
|
||||
<contrib-uptodate name="queries-contrib" contrib-src-name="queries" property="queries.uptodate" classpath.property="queries.jar"/>
|
||||
<contrib-uptodate name="queries-contrib" contrib-src-name="queries" property="queries-contrib.uptodate" classpath.property="queries-contrib.jar"/>
|
||||
<property name="lucene.jar" location="${common.dir}/build/lucene-core-${version}.jar"/>
|
||||
<property name="servlet.jar" location="${common.dir}/lib/servlet-api-2.4.jar"/>
|
||||
<available property="servlet.jar.present" type="file" file="${servlet.jar}"/>
|
||||
<module-uptodate name="queryparser" property="queryparser.uptodate" classpath.property="queryparser.jar"/>
|
||||
<module-uptodate name="queries" property="queries.uptodate" classpath.property="queries.jar"/>
|
||||
|
||||
|
||||
<path id="classpath">
|
||||
<pathelement path="${queries.jar}"/>
|
||||
<pathelement path="${queries-contrib.jar}"/>
|
||||
<pathelement path="${queryparser.jar}"/>
|
||||
<pathelement path="${queries.jar}"/>
|
||||
<path refid="base.classpath"/>
|
||||
</path>
|
||||
|
||||
<path id="web-classpath">
|
||||
<pathelement path="${queries.jar}"/>
|
||||
<pathelement path="${queries-contrib.jar}"/>
|
||||
<pathelement path="${queryparser.jar}"/>
|
||||
<pathelement path="${queries.jar}"/>
|
||||
<pathelement path="${servlet.jar}"/>
|
||||
<pathelement path="${build.dir}/${final.name}.jar"/>
|
||||
<path refid="base.classpath"/>
|
||||
</path>
|
||||
|
||||
<target name="compile-core" depends="build-queries,build-queryparser,common.compile-core" />
|
||||
<target name="compile-core" depends="build-queries-contrib,build-queryparser,build-queries,common.compile-core" />
|
||||
|
||||
<!-- Builds the sibling ../queries project (target "default") through a nested ant call, passing the uptodate.and.compiled.properties property set, then records the up-to-date flag. NOTE(review): this diff hunk shows both the old lines (target build-queries / queries.uptodate) and the new lines (target build-queries-contrib / queries-contrib.uptodate); the echo still prints ${queries.jar}, presumably it should print ${queries-contrib.jar} after the rename. Confirm. -->
<target name="build-queries" unless="queries.uptodate">
|
||||
<target name="build-queries-contrib" unless="queries-contrib.uptodate">
|
||||
<echo>XML Parser building dependency ${queries.jar}</echo>
|
||||
<ant antfile="../queries/build.xml" target="default" inheritall="false" dir="../queries">
|
||||
<propertyset refid="uptodate.and.compiled.properties"/>
|
||||
</ant>
|
||||
<property name="queries.uptodate" value="true"/>
|
||||
|
||||
<property name="queries-contrib.uptodate" value="true"/>
|
||||
</target>
|
||||
|
||||
<!-- override contrib-build.xml target to also build web demo -->
|
||||
|
@ -86,10 +89,17 @@
|
|||
</target>
|
||||
|
||||
<!-- Builds the modules/queryparser dependency by running the "default" target of its build.xml via subant; skipped when queryparser.uptodate is set. NOTE(review): both the old ("Demo ...") and new ("XML QueryParser ...") echo lines of the diff are shown here. -->
<target name="build-queryparser" unless="queryparser.uptodate">
|
||||
<echo>Demo building dependency modules/queryparser</echo>
|
||||
<echo>XML QueryParser building dependency modules/queryparser</echo>
|
||||
<subant target="default">
|
||||
<fileset dir="${common.dir}/../modules/queryparser" includes="build.xml"/>
|
||||
</subant>
|
||||
</target>
|
||||
|
||||
<!-- Builds the modules/queries dependency by running the "default" target of its build.xml via subant. Guarded by queries.uptodate (the property of the module-uptodate check for the queries module), not by the queryparser flag, so each module is rebuilt or skipped based on its own state. -->
<target name="build-queries" unless="queries.uptodate">
  <echo>XML QueryParser building dependency modules/queries</echo>
  <subant target="default">
    <fileset dir="${common.dir}/../modules/queries" includes="build.xml"/>
  </subant>
</target>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -11,7 +11,7 @@ import java.util.Set;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.search.similar.MoreLikeThisQuery;
|
||||
import org.apache.lucene.queries.mlt.MoreLikeThisQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.xmlparser.DOMUtils;
|
||||
import org.apache.lucene.xmlparser.ParserException;
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similar;
|
||||
package org.apache.lucene.queries.mlt;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
* Created on 25-Jan-2006
|
||||
*/
|
||||
package org.apache.lucene.search.similar;
|
||||
package org.apache.lucene.queries.mlt;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.lucene.search.similar;
|
||||
package org.apache.lucene.queries.mlt;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -92,14 +92,13 @@ public class TestMoreLikeThis extends LuceneTestCase {
|
|||
|
||||
assertEquals("Expected " + originalValues.size() + " clauses.",
|
||||
originalValues.size(), clauses.size());
|
||||
|
||||
for (int i = 0; i < clauses.size(); i++) {
|
||||
BooleanClause clause = clauses.get(i);
|
||||
|
||||
for (BooleanClause clause : clauses) {
|
||||
TermQuery tq = (TermQuery) clause.getQuery();
|
||||
Float termBoost = originalValues.get(tq.getTerm().text());
|
||||
assertNotNull("Expected term " + tq.getTerm().text(), termBoost);
|
||||
|
||||
float totalBoost = termBoost.floatValue() * boostFactor;
|
||||
|
||||
float totalBoost = termBoost * boostFactor;
|
||||
assertEquals("Expected boost of " + totalBoost + " for term '"
|
||||
+ tq.getTerm().text() + "' got " + tq.getBoost(), totalBoost, tq
|
||||
.getBoost(), 0.0001);
|
||||
|
@ -118,11 +117,10 @@ public class TestMoreLikeThis extends LuceneTestCase {
|
|||
BooleanQuery query = (BooleanQuery) mlt.like(new StringReader(
|
||||
"lucene release"));
|
||||
List<BooleanClause> clauses = query.clauses();
|
||||
|
||||
for (int i = 0; i < clauses.size(); i++) {
|
||||
BooleanClause clause = clauses.get(i);
|
||||
|
||||
for (BooleanClause clause : clauses) {
|
||||
TermQuery tq = (TermQuery) clause.getQuery();
|
||||
originalValues.put(tq.getTerm().text(), Float.valueOf(tq.getBoost()));
|
||||
originalValues.put(tq.getTerm().text(), tq.getBoost());
|
||||
}
|
||||
return originalValues;
|
||||
}
|
|
@ -34,7 +34,7 @@ import org.apache.lucene.index.IndexReader;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.queryparser.classic.ParseException;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.similar.MoreLikeThis;
|
||||
import org.apache.lucene.queries.mlt.MoreLikeThis;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
import org.apache.solr.common.params.FacetParams;
|
||||
|
|
Loading…
Reference in New Issue