LUCENE-3326: MoreLikeThis reuses a reader after it has already closed it

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1147881 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-07-18 13:56:49 +00:00
parent 1a9c947c2d
commit 07bfe50eda
6 changed files with 37 additions and 107 deletions

View File

@@ -83,6 +83,7 @@ New Features
Removed contrib/wordnet. (Robert Muir, Mike McCandless)
API Changes
* LUCENE-3296: PKIndexSplitter & MultiPassIndexSplitter now have version
constructors. PKIndexSplitter accepts an IndexWriterConfig for each of
the target indexes. (Simon Willnauer, Jason Rutherglen)
@@ -95,6 +96,12 @@ Optimizations
Bug Fixes
* LUCENE-3326: Fixed a bug where MoreLikeThis.like(Reader) would try to
re-analyze the same Reader multiple times, passing different field names
to the analyzer. Additionally, MoreLikeThisQuery would encode/decode your
string using the default charset; it now uses a StringReader. Finally,
MoreLikeThis's methods that take File, URL, or InputStream are deprecated;
please create the Reader yourself. (Trejkaz, Robert Muir)
======================= Lucene 3.3.0 =======================
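
For context (not part of the patch): the new signature makes the caller name the field whose analysis should be applied to the free-form text, instead of MoreLikeThis looping over every configured field name with a single Reader. A minimal usage sketch, assuming an already open IndexReader, an Analyzer, and index fields named "title" and "body" (all assumptions, not from this commit):

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.similar.MoreLikeThis;

public class MltLikeTextSketch {
  // Build a "more like this" query from free-form text with the new API.
  public static Query likeText(IndexReader reader, Analyzer analyzer, String text) throws IOException {
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(analyzer);
    mlt.setFieldNames(new String[] {"title", "body"}); // fields the generated query will search
    Reader content = new StringReader(text);
    // Old API: mlt.like(content) re-analyzed this one Reader once per field
    // name and failed after the first pass had consumed it. Now the caller
    // picks the single field whose analysis chain matches the text.
    return mlt.like(content, "body");
  }
}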

View File

@@ -96,7 +96,7 @@ public class LikeThisQueryBuilder implements QueryBuilder {
}
MoreLikeThisQuery mlt=new MoreLikeThisQuery(DOMUtils.getText(e),fields,analyzer);
MoreLikeThisQuery mlt=new MoreLikeThisQuery(DOMUtils.getText(e),fields,analyzer, fields[0]);
mlt.setMaxQueryTerms(DOMUtils.getAttribute(e,"maxQueryTerms",defaultMaxQueryTerms));
mlt.setMinTermFrequency(DOMUtils.getAttribute(e,"minTermFrequency",defaultMinTermFrequency));
mlt.setPercentTermsToMatch(DOMUtils.getAttribute(e,"percentTermsToMatch",defaultPercentTermsToMatch)/100);

View File

@@ -573,46 +573,13 @@ public final class MoreLikeThis {
return createQuery(retrieveTerms(docNum));
}
/**
* Return a query that will return docs like the passed file.
*
* @return a query that will return docs like the passed file.
*/
public Query like(File f) throws IOException {
if (fieldNames == null) {
// gather list of valid fields from lucene
Collection<String> fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
fieldNames = fields.toArray(new String[fields.size()]);
}
return like(new FileReader(f));
}
/**
* Return a query that will return docs like the passed URL.
*
* @return a query that will return docs like the passed URL.
*/
public Query like(URL u) throws IOException {
return like(new InputStreamReader(u.openConnection().getInputStream()));
}
/**
* Return a query that will return docs like the passed stream.
*
* @return a query that will return docs like the passed stream.
*/
public Query like(java.io.InputStream is) throws IOException {
return like(new InputStreamReader(is));
}
/**
* Return a query that will return docs like the passed Reader.
*
* @return a query that will return docs like the passed Reader.
*/
public Query like(Reader r) throws IOException {
return createQuery(retrieveTerms(r));
public Query like(Reader r, String fieldName) throws IOException {
return createQuery(retrieveTerms(r, fieldName));
}
/**
@@ -726,65 +693,6 @@ public final class MoreLikeThis {
return sb.toString();
}
/**
* Test driver.
* Pass in "-i INDEX" and then either "-fn FILE" or "-url URL".
*/
public static void main(String[] a) throws Throwable {
String indexName = "localhost_index";
String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
URL url = null;
for (int i = 0; i < a.length; i++) {
if (a[i].equals("-i")) {
indexName = a[++i];
} else if (a[i].equals("-f")) {
fn = a[++i];
} else if (a[i].equals("-url")) {
url = new URL(a[++i]);
}
}
PrintStream o = System.out;
FSDirectory dir = FSDirectory.open(new File(indexName));
IndexReader r = IndexReader.open(dir, true);
o.println("Open index " + indexName + " which has " + r.numDocs() + " docs");
MoreLikeThis mlt = new MoreLikeThis(r);
o.println("Query generation parameters:");
o.println(mlt.describeParams());
o.println();
Query query = null;
if (url != null) {
o.println("Parsing URL: " + url);
query = mlt.like(url);
} else if (fn != null) {
o.println("Parsing file: " + fn);
query = mlt.like(new File(fn));
}
o.println("q: " + query);
o.println();
IndexSearcher searcher = new IndexSearcher(dir, true);
TopDocs hits = searcher.search(query, null, 25);
int len = hits.totalHits;
o.println("found: " + len + " documents matching");
o.println();
ScoreDoc[] scoreDocs = hits.scoreDocs;
for (int i = 0; i < Math.min(25, len); i++) {
Document d = searcher.doc(scoreDocs[i].doc);
String summary = d.get("summary");
o.println("score : " + scoreDocs[i].score);
o.println("url : " + d.get("url"));
o.println("\ttitle : " + d.get("title"));
if (summary != null)
o.println("\tsummary: " + d.get("summary"));
o.println();
}
}
/**
* Find words for a more-like-this query former.
*
@@ -918,14 +826,13 @@ public final class MoreLikeThis {
* For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
*
* @param r the reader that has the content of the document
* @param fieldName field passed to the analyzer to use when analyzing the content
* @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
* @see #retrieveInterestingTerms
*/
public PriorityQueue<Object[]> retrieveTerms(Reader r) throws IOException {
public PriorityQueue<Object[]> retrieveTerms(Reader r, String fieldName) throws IOException {
Map<String, Int> words = new HashMap<String, Int>();
for (String fieldName : fieldNames) {
addTermFrequencies(r, words, fieldName);
}
return createQueue(words);
}
@@ -948,16 +855,17 @@ public final class MoreLikeThis {
/**
* Convenience routine to make it easy to return the most interesting words in a document.
* More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly.
* More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
*
* @param r the source document
* @param fieldName field passed to analyzer to use when analyzing the content
* @return the most interesting words in the document
* @see #retrieveTerms(java.io.Reader)
* @see #setMaxQueryTerms
*/
public String[] retrieveInterestingTerms(Reader r) throws IOException {
public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
ArrayList<Object> al = new ArrayList<Object>(maxQueryTerms);
PriorityQueue<Object[]> pq = retrieveTerms(r);
PriorityQueue<Object[]> pq = retrieveTerms(r, fieldName);
Object cur;
int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
// we just want to return the top words
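
A hedged sketch of what callers of the now-deprecated like(File) helper do instead: open the Reader themselves, which also lets them pick an explicit charset rather than the platform default that the removed FileReader-based helper used. The file, field name, and UTF-8 choice below are assumptions for illustration, not part of the patch:

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.similar.MoreLikeThis;

public class MltLikeFileSketch {
  public static Query likeFile(MoreLikeThis mlt, File f, String fieldName) throws IOException {
    // The removed like(File) wrapped the file in a FileReader, i.e. the
    // platform default encoding; here the charset is explicit.
    Reader reader = new InputStreamReader(new FileInputStream(f), Charset.forName("UTF-8"));
    try {
      return mlt.like(reader, fieldName);
    } finally {
      reader.close();
    }
  }
}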

View File

@@ -28,6 +28,7 @@ import org.apache.lucene.search.Query;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.util.Set;
/**
@@ -40,6 +41,7 @@ public class MoreLikeThisQuery extends Query {
private String likeText;
private String[] moreLikeFields;
private Analyzer analyzer;
private String fieldName;
private float percentTermsToMatch = 0.3f;
private int minTermFrequency = 1;
private int maxQueryTerms = 5;
@@ -49,10 +51,11 @@ public class MoreLikeThisQuery extends Query {
/**
* @param moreLikeFields
*/
public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) {
public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer, String fieldName) {
this.likeText = likeText;
this.moreLikeFields = moreLikeFields;
this.analyzer = analyzer;
this.fieldName = fieldName;
}
@Override
@@ -67,7 +70,7 @@ public class MoreLikeThisQuery extends Query {
}
mlt.setMaxQueryTerms(maxQueryTerms);
mlt.setStopWords(stopWords);
BooleanQuery bq = (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
BooleanQuery bq = (BooleanQuery) mlt.like(new StringReader(likeText), fieldName);
BooleanClause[] clauses = bq.getClauses();
//make at least half the terms match
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
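
The StringReader change above removes an encode/decode round trip through the JVM default charset. A small stand-alone sketch of why that round trip was lossy; the Japanese sample text and the US-ASCII default-charset scenario are illustrative assumptions, not taken from the commit:

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.InputStreamReader;

public class DefaultCharsetRoundTrip {
  public static void main(String[] args) throws Exception {
    String likeText = "\u65e5\u672c\u8a9e";      // sample non-Latin text
    // Old behavior: encode with the default charset...
    byte[] bytes = likeText.getBytes();
    // ...then decode with the default charset again before analysis.
    BufferedReader r = new BufferedReader(
        new InputStreamReader(new ByteArrayInputStream(bytes)));
    // On a JVM whose default charset cannot represent the text
    // (e.g. -Dfile.encoding=US-ASCII) this prints "???", so the analyzer
    // never saw the original terms; new StringReader(likeText) avoids it.
    System.out.println(r.readLine());
  }
}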

View File

@@ -87,7 +87,7 @@ public class TestMoreLikeThis extends LuceneTestCase {
mlt.setBoostFactor(boostFactor);
BooleanQuery query = (BooleanQuery) mlt.like(new StringReader(
"lucene release"));
"lucene release"), "text");
List<BooleanClause> clauses = query.clauses();
assertEquals("Expected " + originalValues.size() + " clauses.",
@@ -115,7 +115,7 @@ public class TestMoreLikeThis extends LuceneTestCase {
mlt.setFieldNames(new String[] {"text"});
mlt.setBoost(true);
BooleanQuery query = (BooleanQuery) mlt.like(new StringReader(
"lucene release"));
"lucene release"), "text");
List<BooleanClause> clauses = query.clauses();
for (BooleanClause clause : clauses) {
@@ -124,4 +124,15 @@ public class TestMoreLikeThis extends LuceneTestCase {
}
return originalValues;
}
// LUCENE-3326
public void testMultiFields() throws Exception {
MoreLikeThis mlt = new MoreLikeThis(reader);
mlt.setAnalyzer(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
mlt.setMinDocFreq(1);
mlt.setMinTermFreq(1);
mlt.setMinWordLen(1);
mlt.setFieldNames(new String[] {"text", "foobar"});
mlt.like(new StringReader("this is a test"), "foobar");
}
}

View File

@@ -367,7 +367,8 @@ public class MoreLikeThisHandler extends RequestHandlerBase
public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
{
rawMLTQuery = mlt.like(reader);
// analyzing with the first field: previous (stupid) behavior
rawMLTQuery = mlt.like(reader, mlt.getFieldNames()[0]);
boostedMLTQuery = getBoostedQuery( rawMLTQuery );
if( terms != null ) {
fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms );