mirror of https://github.com/apache/lucene.git
LUCENE-3326: MoreLikeThis reuses a reader after it has already closed it
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1147881 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 1a9c947c2d
commit 07bfe50eda
@@ -83,6 +83,7 @@ New Features
   Removed contrib/wordnet. (Robert Muir, Mike McCandless)
 
 API Changes
 
 * LUCENE-3296: PKIndexSplitter & MultiPassIndexSplitter now have version
   constructors. PKIndexSplitter accepts a IndexWriterConfig for each of
   the target indexes. (Simon Willnauer, Jason Rutherglen)

@@ -95,6 +96,12 @@ Optimizations
 
 Bug Fixes
 
+* LUCENE-3326: Fixed bug if you used MoreLikeThis.like(Reader), it would
+  try to re-analyze the same Reader multiple times, passing different
+  field names to the analyzer. Additionally MoreLikeThisQuery would take
+  your string and encode/decode it using the default charset, it now uses
+  a StringReader. Finally, MoreLikeThis's methods that take File, URL, InputStream,
+  are deprecated, please create the Reader yourself. (Trejkaz, Robert Muir)
 
 ======================= Lucene 3.3.0 =======================
 
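The Bug Fixes entry above describes the migration this commit asks of callers: open the Reader yourself and tell MoreLikeThis which field name to hand to the analyzer. The following is a minimal migration sketch, not part of the patch; the open IndexReader, the caller-supplied Analyzer, the UTF-8 file contents, the field name "text", and the contrib/queries package org.apache.lucene.search.similar are all assumptions made for illustration.

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.similar.MoreLikeThis;

public final class MoreLikeThisMigrationSketch {
  /** Builds a "more like this file" query against the (hypothetical) "text" field. */
  public static Query likeFile(IndexReader reader, Analyzer analyzer, File docFile)
      throws IOException {
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(analyzer);                    // required when analyzing free text from a Reader
    mlt.setFieldNames(new String[] { "text" });   // fields consulted for document frequencies

    // Before this change: query = mlt.like(docFile); -- the charset was picked for you.
    // After: open the Reader yourself and name the field the analyzer should use.
    Reader content = new InputStreamReader(new FileInputStream(docFile), "UTF-8");
    try {
      return mlt.like(content, "text");
    } finally {
      content.close();
    }
  }
}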
@@ -96,7 +96,7 @@ public class LikeThisQueryBuilder implements QueryBuilder {
       }
 
 
-      MoreLikeThisQuery mlt=new MoreLikeThisQuery(DOMUtils.getText(e),fields,analyzer);
+      MoreLikeThisQuery mlt=new MoreLikeThisQuery(DOMUtils.getText(e),fields,analyzer, fields[0]);
       mlt.setMaxQueryTerms(DOMUtils.getAttribute(e,"maxQueryTerms",defaultMaxQueryTerms));
       mlt.setMinTermFrequency(DOMUtils.getAttribute(e,"minTermFrequency",defaultMinTermFrequency));
       mlt.setPercentTermsToMatch(DOMUtils.getAttribute(e,"percentTermsToMatch",defaultPercentTermsToMatch)/100);
@@ -573,46 +573,13 @@ public final class MoreLikeThis {
     return createQuery(retrieveTerms(docNum));
   }
 
-  /**
-   * Return a query that will return docs like the passed file.
-   *
-   * @return a query that will return docs like the passed file.
-   */
-  public Query like(File f) throws IOException {
-    if (fieldNames == null) {
-      // gather list of valid fields from lucene
-      Collection<String> fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
-      fieldNames = fields.toArray(new String[fields.size()]);
-    }
-
-    return like(new FileReader(f));
-  }
-
-  /**
-   * Return a query that will return docs like the passed URL.
-   *
-   * @return a query that will return docs like the passed URL.
-   */
-  public Query like(URL u) throws IOException {
-    return like(new InputStreamReader(u.openConnection().getInputStream()));
-  }
-
-  /**
-   * Return a query that will return docs like the passed stream.
-   *
-   * @return a query that will return docs like the passed stream.
-   */
-  public Query like(java.io.InputStream is) throws IOException {
-    return like(new InputStreamReader(is));
-  }
-
   /**
    * Return a query that will return docs like the passed Reader.
    *
    * @return a query that will return docs like the passed Reader.
    */
-  public Query like(Reader r) throws IOException {
-    return createQuery(retrieveTerms(r));
+  public Query like(Reader r, String fieldName) throws IOException {
+    return createQuery(retrieveTerms(r, fieldName));
   }
 
   /**
@@ -726,65 +693,6 @@ public final class MoreLikeThis {
     return sb.toString();
   }
 
-  /**
-   * Test driver.
-   * Pass in "-i INDEX" and then either "-fn FILE" or "-url URL".
-   */
-  public static void main(String[] a) throws Throwable {
-    String indexName = "localhost_index";
-    String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
-    URL url = null;
-    for (int i = 0; i < a.length; i++) {
-      if (a[i].equals("-i")) {
-        indexName = a[++i];
-      } else if (a[i].equals("-f")) {
-        fn = a[++i];
-      } else if (a[i].equals("-url")) {
-        url = new URL(a[++i]);
-      }
-    }
-
-    PrintStream o = System.out;
-    FSDirectory dir = FSDirectory.open(new File(indexName));
-    IndexReader r = IndexReader.open(dir, true);
-    o.println("Open index " + indexName + " which has " + r.numDocs() + " docs");
-
-    MoreLikeThis mlt = new MoreLikeThis(r);
-
-    o.println("Query generation parameters:");
-    o.println(mlt.describeParams());
-    o.println();
-
-    Query query = null;
-    if (url != null) {
-      o.println("Parsing URL: " + url);
-      query = mlt.like(url);
-    } else if (fn != null) {
-      o.println("Parsing file: " + fn);
-      query = mlt.like(new File(fn));
-    }
-
-    o.println("q: " + query);
-    o.println();
-    IndexSearcher searcher = new IndexSearcher(dir, true);
-
-    TopDocs hits = searcher.search(query, null, 25);
-    int len = hits.totalHits;
-    o.println("found: " + len + " documents matching");
-    o.println();
-    ScoreDoc[] scoreDocs = hits.scoreDocs;
-    for (int i = 0; i < Math.min(25, len); i++) {
-      Document d = searcher.doc(scoreDocs[i].doc);
-      String summary = d.get("summary");
-      o.println("score : " + scoreDocs[i].score);
-      o.println("url : " + d.get("url"));
-      o.println("\ttitle : " + d.get("title"));
-      if (summary != null)
-        o.println("\tsummary: " + d.get("summary"));
-      o.println();
-    }
-  }
-
   /**
    * Find words for a more-like-this query former.
    *
@@ -918,14 +826,13 @@ public final class MoreLikeThis {
    * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
    *
    * @param r the reader that has the content of the document
+   * @param fieldName field passed to the analyzer to use when analyzing the content
    * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
    * @see #retrieveInterestingTerms
    */
-  public PriorityQueue<Object[]> retrieveTerms(Reader r) throws IOException {
+  public PriorityQueue<Object[]> retrieveTerms(Reader r, String fieldName) throws IOException {
     Map<String, Int> words = new HashMap<String, Int>();
-    for (String fieldName : fieldNames) {
-      addTermFrequencies(r, words, fieldName);
-    }
+    addTermFrequencies(r, words, fieldName);
     return createQueue(words);
   }
 
@@ -948,16 +855,17 @@ public final class MoreLikeThis {
 
   /**
    * Convenience routine to make it easy to return the most interesting words in a document.
-   * More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly.
+   * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
    *
    * @param r the source document
+   * @param fieldName field passed to analyzer to use when analyzing the content
    * @return the most interesting words in the document
    * @see #retrieveTerms(java.io.Reader)
    * @see #setMaxQueryTerms
    */
-  public String[] retrieveInterestingTerms(Reader r) throws IOException {
+  public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
     ArrayList<Object> al = new ArrayList<Object>(maxQueryTerms);
-    PriorityQueue<Object[]> pq = retrieveTerms(r);
+    PriorityQueue<Object[]> pq = retrieveTerms(r, fieldName);
     Object cur;
     int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
     // we just want to return the top words
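The two hunks above rework term extraction so the text is analyzed once, under one explicitly named field, instead of the same Reader being re-read once per field. A hedged usage sketch of the new entry point, not from the patch: the open IndexReader, the caller-supplied Analyzer, the field name "text", and the package location are assumptions.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.similar.MoreLikeThis;

public final class InterestingTermsSketch {
  /** Returns the highest-scoring terms of some free text, analyzed as the "text" field. */
  public static String[] topTerms(IndexReader reader, Analyzer analyzer, String freeText)
      throws IOException {
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(analyzer);                    // needed for Reader/String input
    mlt.setFieldNames(new String[] { "text" });   // fields used when looking up document frequencies
    mlt.setMinTermFreq(1);                        // permissive settings for short snippets
    mlt.setMinDocFreq(1);
    // One pass over the text, under one explicit field name.
    return mlt.retrieveInterestingTerms(new StringReader(freeText), "text");
  }
}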
@@ -28,6 +28,7 @@ import org.apache.lucene.search.Query;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
+import java.io.StringReader;
 import java.util.Set;
 
 /**
@@ -40,6 +41,7 @@ public class MoreLikeThisQuery extends Query {
   private String likeText;
   private String[] moreLikeFields;
   private Analyzer analyzer;
+  private String fieldName;
   private float percentTermsToMatch = 0.3f;
   private int minTermFrequency = 1;
   private int maxQueryTerms = 5;
@@ -49,10 +51,11 @@ public class MoreLikeThisQuery extends Query {
   /**
    * @param moreLikeFields
    */
-  public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) {
+  public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer, String fieldName) {
     this.likeText = likeText;
     this.moreLikeFields = moreLikeFields;
     this.analyzer = analyzer;
+    this.fieldName = fieldName;
   }
 
   @Override
@@ -67,7 +70,7 @@ public class MoreLikeThisQuery extends Query {
     }
     mlt.setMaxQueryTerms(maxQueryTerms);
     mlt.setStopWords(stopWords);
-    BooleanQuery bq = (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
+    BooleanQuery bq = (BooleanQuery) mlt.like(new StringReader(likeText), fieldName);
     BooleanClause[] clauses = bq.getClauses();
     //make at least half the terms match
     bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
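The MoreLikeThisQuery changes above add a field name to the constructor (the field under which likeText is analyzed) and keep the text as characters end to end via StringReader, instead of encoding and decoding it through the platform default charset. A minimal construction sketch, not part of the patch; the Analyzer, the field names "title" and "body", and the package location are illustrative assumptions.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.similar.MoreLikeThisQuery;

public final class MltQuerySketch {
  public static Query build(String likeText, Analyzer analyzer) {
    String[] moreLikeFields = { "title", "body" };   // fields mined for interesting terms
    // New signature: the last argument is the field name the analyzer tokenizes likeText under.
    MoreLikeThisQuery q =
        new MoreLikeThisQuery(likeText, moreLikeFields, analyzer, moreLikeFields[0]);
    q.setMinTermFrequency(1);
    q.setMaxQueryTerms(25);
    return q;
  }
}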
@@ -87,7 +87,7 @@ public class TestMoreLikeThis extends LuceneTestCase {
     mlt.setBoostFactor(boostFactor);
 
     BooleanQuery query = (BooleanQuery) mlt.like(new StringReader(
-        "lucene release"));
+        "lucene release"), "text");
     List<BooleanClause> clauses = query.clauses();
 
     assertEquals("Expected " + originalValues.size() + " clauses.",
@@ -115,7 +115,7 @@ public class TestMoreLikeThis extends LuceneTestCase {
     mlt.setFieldNames(new String[] {"text"});
     mlt.setBoost(true);
     BooleanQuery query = (BooleanQuery) mlt.like(new StringReader(
-        "lucene release"));
+        "lucene release"), "text");
     List<BooleanClause> clauses = query.clauses();
 
     for (BooleanClause clause : clauses) {
@@ -124,4 +124,15 @@ public class TestMoreLikeThis extends LuceneTestCase {
     }
     return originalValues;
   }
+
+  // LUCENE-3326
+  public void testMultiFields() throws Exception {
+    MoreLikeThis mlt = new MoreLikeThis(reader);
+    mlt.setAnalyzer(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
+    mlt.setMinDocFreq(1);
+    mlt.setMinTermFreq(1);
+    mlt.setMinWordLen(1);
+    mlt.setFieldNames(new String[] {"text", "foobar"});
+    mlt.like(new StringReader("this is a test"), "foobar");
+  }
 }
@@ -367,7 +367,8 @@ public class MoreLikeThisHandler extends RequestHandlerBase
 
   public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
   {
-    rawMLTQuery = mlt.like(reader);
+    // analyzing with the first field: previous (stupid) behavior
+    rawMLTQuery = mlt.like(reader, mlt.getFieldNames()[0]);
     boostedMLTQuery = getBoostedQuery( rawMLTQuery );
     if( terms != null ) {
       fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms );