mirror of https://github.com/apache/lucene.git
LUCENE-3326: MoreLikeThis reuses a reader after it has already closed it
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1147881 13f79535-47bb-0310-9956-ffa450edef68
parent 1a9c947c2d
commit 07bfe50eda
CHANGES.txt
@@ -83,6 +83,7 @@ New Features
 Removed contrib/wordnet. (Robert Muir, Mike McCandless)

 API Changes

 * LUCENE-3296: PKIndexSplitter & MultiPassIndexSplitter now have version
   constructors. PKIndexSplitter accepts a IndexWriterConfig for each of
   the target indexes. (Simon Willnauer, Jason Rutherglen)
@@ -95,6 +96,12 @@ Optimizations

 Bug Fixes

+* LUCENE-3326: Fixed bug if you used MoreLikeThis.like(Reader), it would
+  try to re-analyze the same Reader multiple times, passing different
+  field names to the analyzer. Additionally MoreLikeThisQuery would take
+  your string and encode/decode it using the default charset, it now uses
+  a StringReader. Finally, MoreLikeThis's methods that take File, URL, InputStream,
+  are deprecated, please create the Reader yourself. (Trejkaz, Robert Muir)

 ======================= Lucene 3.3.0 =======================

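For callers migrating off the File/URL/InputStream overloads this patch drops, the replacement entry point is like(Reader, String). A minimal usage sketch follows; it is not part of the patch, and the "contents" field, the file name, and the example class and method names are illustrative assumptions.

    import java.io.BufferedReader;
    import java.io.FileReader;
    import java.io.Reader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.similar.MoreLikeThis;

    public class LikeReaderExample {
      // indexReader and analyzer are assumed to be opened/configured elsewhere;
      // "contents" and "doc.txt" are illustrative.
      public static Query buildQuery(IndexReader indexReader, Analyzer analyzer) throws Exception {
        MoreLikeThis mlt = new MoreLikeThis(indexReader);
        mlt.setAnalyzer(analyzer);                      // analyzer used on the Reader content
        mlt.setFieldNames(new String[] {"contents"});   // fields used to build the query
        // the caller now opens (and closes) the Reader itself instead of
        // handing MoreLikeThis a File, URL, or InputStream
        Reader content = new BufferedReader(new FileReader("doc.txt"));
        try {
          // the second argument is the field name passed to the analyzer;
          // the content is analyzed exactly once under that name
          return mlt.like(content, "contents");
        } finally {
          content.close();
        }
      }
    }

Because the caller owns the Reader, the charset and closing behavior are explicit rather than decided inside MoreLikeThis.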
LikeThisQueryBuilder.java
@@ -96,7 +96,7 @@ public class LikeThisQueryBuilder implements QueryBuilder {
     }


-    MoreLikeThisQuery mlt=new MoreLikeThisQuery(DOMUtils.getText(e),fields,analyzer);
+    MoreLikeThisQuery mlt=new MoreLikeThisQuery(DOMUtils.getText(e),fields,analyzer, fields[0]);
     mlt.setMaxQueryTerms(DOMUtils.getAttribute(e,"maxQueryTerms",defaultMaxQueryTerms));
     mlt.setMinTermFrequency(DOMUtils.getAttribute(e,"minTermFrequency",defaultMinTermFrequency));
     mlt.setPercentTermsToMatch(DOMUtils.getAttribute(e,"percentTermsToMatch",defaultPercentTermsToMatch)/100);
MoreLikeThis.java
@@ -573,46 +573,13 @@ public final class MoreLikeThis {
     return createQuery(retrieveTerms(docNum));
   }

-  /**
-   * Return a query that will return docs like the passed file.
-   *
-   * @return a query that will return docs like the passed file.
-   */
-  public Query like(File f) throws IOException {
-    if (fieldNames == null) {
-      // gather list of valid fields from lucene
-      Collection<String> fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
-      fieldNames = fields.toArray(new String[fields.size()]);
-    }
-
-    return like(new FileReader(f));
-  }
-
-  /**
-   * Return a query that will return docs like the passed URL.
-   *
-   * @return a query that will return docs like the passed URL.
-   */
-  public Query like(URL u) throws IOException {
-    return like(new InputStreamReader(u.openConnection().getInputStream()));
-  }
-
-  /**
-   * Return a query that will return docs like the passed stream.
-   *
-   * @return a query that will return docs like the passed stream.
-   */
-  public Query like(java.io.InputStream is) throws IOException {
-    return like(new InputStreamReader(is));
-  }
-
   /**
    * Return a query that will return docs like the passed Reader.
    *
    * @return a query that will return docs like the passed Reader.
    */
-  public Query like(Reader r) throws IOException {
-    return createQuery(retrieveTerms(r));
+  public Query like(Reader r, String fieldName) throws IOException {
+    return createQuery(retrieveTerms(r, fieldName));
   }

   /**
@@ -726,65 +693,6 @@ public final class MoreLikeThis {
     return sb.toString();
   }

-  /**
-   * Test driver.
-   * Pass in "-i INDEX" and then either "-fn FILE" or "-url URL".
-   */
-  public static void main(String[] a) throws Throwable {
-    String indexName = "localhost_index";
-    String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
-    URL url = null;
-    for (int i = 0; i < a.length; i++) {
-      if (a[i].equals("-i")) {
-        indexName = a[++i];
-      } else if (a[i].equals("-f")) {
-        fn = a[++i];
-      } else if (a[i].equals("-url")) {
-        url = new URL(a[++i]);
-      }
-    }
-
-    PrintStream o = System.out;
-    FSDirectory dir = FSDirectory.open(new File(indexName));
-    IndexReader r = IndexReader.open(dir, true);
-    o.println("Open index " + indexName + " which has " + r.numDocs() + " docs");
-
-    MoreLikeThis mlt = new MoreLikeThis(r);
-
-    o.println("Query generation parameters:");
-    o.println(mlt.describeParams());
-    o.println();
-
-    Query query = null;
-    if (url != null) {
-      o.println("Parsing URL: " + url);
-      query = mlt.like(url);
-    } else if (fn != null) {
-      o.println("Parsing file: " + fn);
-      query = mlt.like(new File(fn));
-    }
-
-    o.println("q: " + query);
-    o.println();
-    IndexSearcher searcher = new IndexSearcher(dir, true);
-
-    TopDocs hits = searcher.search(query, null, 25);
-    int len = hits.totalHits;
-    o.println("found: " + len + " documents matching");
-    o.println();
-    ScoreDoc[] scoreDocs = hits.scoreDocs;
-    for (int i = 0; i < Math.min(25, len); i++) {
-      Document d = searcher.doc(scoreDocs[i].doc);
-      String summary = d.get("summary");
-      o.println("score : " + scoreDocs[i].score);
-      o.println("url : " + d.get("url"));
-      o.println("\ttitle : " + d.get("title"));
-      if (summary != null)
-        o.println("\tsummary: " + d.get("summary"));
-      o.println();
-    }
-  }
-
   /**
    * Find words for a more-like-this query former.
    *
@@ -918,14 +826,13 @@ public final class MoreLikeThis {
    * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
    *
    * @param r the reader that has the content of the document
+   * @param fieldName field passed to the analyzer to use when analyzing the content
    * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
    * @see #retrieveInterestingTerms
    */
-  public PriorityQueue<Object[]> retrieveTerms(Reader r) throws IOException {
+  public PriorityQueue<Object[]> retrieveTerms(Reader r, String fieldName) throws IOException {
     Map<String, Int> words = new HashMap<String, Int>();
-    for (String fieldName : fieldNames) {
-      addTermFrequencies(r, words, fieldName);
-    }
+    addTermFrequencies(r, words, fieldName);
     return createQueue(words);
   }

@@ -948,16 +855,17 @@ public final class MoreLikeThis {

   /**
    * Convenience routine to make it easy to return the most interesting words in a document.
-   * More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly.
+   * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
    *
    * @param r the source document
+   * @param fieldName field passed to analyzer to use when analyzing the content
    * @return the most interesting words in the document
    * @see #retrieveTerms(java.io.Reader)
    * @see #setMaxQueryTerms
    */
-  public String[] retrieveInterestingTerms(Reader r) throws IOException {
+  public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
     ArrayList<Object> al = new ArrayList<Object>(maxQueryTerms);
-    PriorityQueue<Object[]> pq = retrieveTerms(r);
+    PriorityQueue<Object[]> pq = retrieveTerms(r, fieldName);
     Object cur;
     int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
     // we just want to return the top words
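As the retrieveTerms hunk above shows, the old code looped over every configured field name with a single Reader, which is exactly the reuse-after-consumption bug this issue fixes; the new signature analyzes the content once, for one field. A caller that still wants per-field statistics can supply a fresh Reader per field, roughly as in this sketch (not part of the patch; the field names, content, and example class name are illustrative):

    import java.io.StringReader;
    import java.util.Arrays;
    import org.apache.lucene.search.similar.MoreLikeThis;

    public class PerFieldTermsExample {
      // 'mlt' is an already-configured MoreLikeThis; the field names and the
      // content string are illustrative.
      public static void printInterestingTerms(MoreLikeThis mlt, String content) throws Exception {
        for (String field : new String[] {"title", "body"}) {
          // a fresh StringReader per field avoids reusing an already-consumed Reader
          String[] terms = mlt.retrieveInterestingTerms(new StringReader(content), field);
          System.out.println(field + ": " + Arrays.toString(terms));
        }
      }
    }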
MoreLikeThisQuery.java
@@ -28,6 +28,7 @@ import org.apache.lucene.search.Query;

 import java.io.ByteArrayInputStream;
 import java.io.IOException;
+import java.io.StringReader;
 import java.util.Set;

 /**
@@ -40,6 +41,7 @@ public class MoreLikeThisQuery extends Query {
   private String likeText;
   private String[] moreLikeFields;
   private Analyzer analyzer;
+  private String fieldName;
   private float percentTermsToMatch = 0.3f;
   private int minTermFrequency = 1;
   private int maxQueryTerms = 5;
@@ -49,10 +51,11 @@ public class MoreLikeThisQuery extends Query {
   /**
    * @param moreLikeFields
    */
-  public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) {
+  public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer, String fieldName) {
     this.likeText = likeText;
     this.moreLikeFields = moreLikeFields;
     this.analyzer = analyzer;
+    this.fieldName = fieldName;
   }

   @Override
@@ -67,7 +70,7 @@ public class MoreLikeThisQuery extends Query {
     }
     mlt.setMaxQueryTerms(maxQueryTerms);
     mlt.setStopWords(stopWords);
-    BooleanQuery bq = (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
+    BooleanQuery bq = (BooleanQuery) mlt.like(new StringReader(likeText), fieldName);
     BooleanClause[] clauses = bq.getClauses();
     //make at least half the terms match
     bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
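The extra constructor argument names the field the likeText is analyzed against, mirroring what LikeThisQueryBuilder now passes (fields[0]). A sketch of constructing the query directly, not from the patch; the analyzer parameter, field names, limits, and example class name are illustrative:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.similar.MoreLikeThisQuery;

    public class MoreLikeThisQueryExample {
      // 'analyzer' should match the analyzer the index was built with; the
      // field names and limits below are illustrative.
      public static Query build(Analyzer analyzer) {
        String[] fields = {"title", "body"};
        // likeText is now wrapped in a StringReader internally rather than
        // being round-tripped through the platform default charset
        MoreLikeThisQuery q = new MoreLikeThisQuery("lucene release", fields, analyzer, fields[0]);
        q.setMaxQueryTerms(25);
        q.setMinTermFrequency(1);
        return q;
      }
    }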
TestMoreLikeThis.java
@@ -87,7 +87,7 @@ public class TestMoreLikeThis extends LuceneTestCase {
     mlt.setBoostFactor(boostFactor);

     BooleanQuery query = (BooleanQuery) mlt.like(new StringReader(
-        "lucene release"));
+        "lucene release"), "text");
     List<BooleanClause> clauses = query.clauses();

     assertEquals("Expected " + originalValues.size() + " clauses.",
@@ -115,7 +115,7 @@ public class TestMoreLikeThis extends LuceneTestCase {
     mlt.setFieldNames(new String[] {"text"});
     mlt.setBoost(true);
     BooleanQuery query = (BooleanQuery) mlt.like(new StringReader(
-        "lucene release"));
+        "lucene release"), "text");
     List<BooleanClause> clauses = query.clauses();

     for (BooleanClause clause : clauses) {
@@ -124,4 +124,15 @@ public class TestMoreLikeThis extends LuceneTestCase {
     }
     return originalValues;
   }
+
+  // LUCENE-3326
+  public void testMultiFields() throws Exception {
+    MoreLikeThis mlt = new MoreLikeThis(reader);
+    mlt.setAnalyzer(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
+    mlt.setMinDocFreq(1);
+    mlt.setMinTermFreq(1);
+    mlt.setMinWordLen(1);
+    mlt.setFieldNames(new String[] {"text", "foobar"});
+    mlt.like(new StringReader("this is a test"), "foobar");
+  }
 }
MoreLikeThisHandler.java
@@ -367,7 +367,8 @@ public class MoreLikeThisHandler extends RequestHandlerBase

   public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
   {
-    rawMLTQuery = mlt.like(reader);
+    // analyzing with the first field: previous (stupid) behavior
+    rawMLTQuery = mlt.like(reader, mlt.getFieldNames()[0]);
     boostedMLTQuery = getBoostedQuery( rawMLTQuery );
     if( terms != null ) {
       fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms );