From 07bfe50eda6928b1b72174bcbf2cbcc007fe452e Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 18 Jul 2011 13:56:49 +0000 Subject: [PATCH] LUCENE-3326: MoreLikeThis reuses a reader after it has already closed it git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1147881 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 7 ++ .../builders/LikeThisQueryBuilder.java | 2 +- .../lucene/queries/mlt/MoreLikeThis.java | 110 ++---------------- .../lucene/queries/mlt/MoreLikeThisQuery.java | 7 +- .../lucene/queries/mlt/TestMoreLikeThis.java | 15 ++- .../solr/handler/MoreLikeThisHandler.java | 3 +- 6 files changed, 37 insertions(+), 107 deletions(-) diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index b26d5753793..313ea2782d0 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -83,6 +83,7 @@ New Features Removed contrib/wordnet. (Robert Muir, Mike McCandless) API Changes + * LUCENE-3296: PKIndexSplitter & MultiPassIndexSplitter now have version constructors. PKIndexSplitter accepts a IndexWriterConfig for each of the target indexes. (Simon Willnauer, Jason Rutherglen) @@ -95,6 +96,12 @@ Optimizations Bug Fixes + * LUCENE-3326: Fixed bug if you used MoreLikeThis.like(Reader), it would + try to re-analyze the same Reader multiple times, passing different + field names to the analyzer. Additionally MoreLikeThisQuery would take + your string and encode/decode it using the default charset, it now uses + a StringReader. Finally, MoreLikeThis's methods that take File, URL, InputStream, + are deprecated, please create the Reader yourself. (Trejkaz, Robert Muir) ======================= Lucene 3.3.0 ======================= diff --git a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java index 4fac9b04d5a..179051e282a 100644 --- a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java +++ b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/LikeThisQueryBuilder.java @@ -96,7 +96,7 @@ public class LikeThisQueryBuilder implements QueryBuilder { } - MoreLikeThisQuery mlt=new MoreLikeThisQuery(DOMUtils.getText(e),fields,analyzer); + MoreLikeThisQuery mlt=new MoreLikeThisQuery(DOMUtils.getText(e),fields,analyzer, fields[0]); mlt.setMaxQueryTerms(DOMUtils.getAttribute(e,"maxQueryTerms",defaultMaxQueryTerms)); mlt.setMinTermFrequency(DOMUtils.getAttribute(e,"minTermFrequency",defaultMinTermFrequency)); mlt.setPercentTermsToMatch(DOMUtils.getAttribute(e,"percentTermsToMatch",defaultPercentTermsToMatch)/100); diff --git a/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java b/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java index 3134826b7ac..7489034cf21 100644 --- a/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java +++ b/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java @@ -573,46 +573,13 @@ public final class MoreLikeThis { return createQuery(retrieveTerms(docNum)); } - /** - * Return a query that will return docs like the passed file. - * - * @return a query that will return docs like the passed file. - */ - public Query like(File f) throws IOException { - if (fieldNames == null) { - // gather list of valid fields from lucene - Collection fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED); - fieldNames = fields.toArray(new String[fields.size()]); - } - - return like(new FileReader(f)); - } - - /** - * Return a query that will return docs like the passed URL. - * - * @return a query that will return docs like the passed URL. - */ - public Query like(URL u) throws IOException { - return like(new InputStreamReader(u.openConnection().getInputStream())); - } - - /** - * Return a query that will return docs like the passed stream. - * - * @return a query that will return docs like the passed stream. - */ - public Query like(java.io.InputStream is) throws IOException { - return like(new InputStreamReader(is)); - } - /** * Return a query that will return docs like the passed Reader. * * @return a query that will return docs like the passed Reader. */ - public Query like(Reader r) throws IOException { - return createQuery(retrieveTerms(r)); + public Query like(Reader r, String fieldName) throws IOException { + return createQuery(retrieveTerms(r, fieldName)); } /** @@ -726,65 +693,6 @@ public final class MoreLikeThis { return sb.toString(); } - /** - * Test driver. - * Pass in "-i INDEX" and then either "-fn FILE" or "-url URL". - */ - public static void main(String[] a) throws Throwable { - String indexName = "localhost_index"; - String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en"; - URL url = null; - for (int i = 0; i < a.length; i++) { - if (a[i].equals("-i")) { - indexName = a[++i]; - } else if (a[i].equals("-f")) { - fn = a[++i]; - } else if (a[i].equals("-url")) { - url = new URL(a[++i]); - } - } - - PrintStream o = System.out; - FSDirectory dir = FSDirectory.open(new File(indexName)); - IndexReader r = IndexReader.open(dir, true); - o.println("Open index " + indexName + " which has " + r.numDocs() + " docs"); - - MoreLikeThis mlt = new MoreLikeThis(r); - - o.println("Query generation parameters:"); - o.println(mlt.describeParams()); - o.println(); - - Query query = null; - if (url != null) { - o.println("Parsing URL: " + url); - query = mlt.like(url); - } else if (fn != null) { - o.println("Parsing file: " + fn); - query = mlt.like(new File(fn)); - } - - o.println("q: " + query); - o.println(); - IndexSearcher searcher = new IndexSearcher(dir, true); - - TopDocs hits = searcher.search(query, null, 25); - int len = hits.totalHits; - o.println("found: " + len + " documents matching"); - o.println(); - ScoreDoc[] scoreDocs = hits.scoreDocs; - for (int i = 0; i < Math.min(25, len); i++) { - Document d = searcher.doc(scoreDocs[i].doc); - String summary = d.get("summary"); - o.println("score : " + scoreDocs[i].score); - o.println("url : " + d.get("url")); - o.println("\ttitle : " + d.get("title")); - if (summary != null) - o.println("\tsummary: " + d.get("summary")); - o.println(); - } - } - /** * Find words for a more-like-this query former. * @@ -918,14 +826,13 @@ public final class MoreLikeThis { * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}. * * @param r the reader that has the content of the document + * @param fieldName field passed to the analyzer to use when analyzing the content * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first * @see #retrieveInterestingTerms */ - public PriorityQueue retrieveTerms(Reader r) throws IOException { + public PriorityQueue retrieveTerms(Reader r, String fieldName) throws IOException { Map words = new HashMap(); - for (String fieldName : fieldNames) { - addTermFrequencies(r, words, fieldName); - } + addTermFrequencies(r, words, fieldName); return createQueue(words); } @@ -948,16 +855,17 @@ public final class MoreLikeThis { /** * Convenience routine to make it easy to return the most interesting words in a document. - * More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly. + * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly. * * @param r the source document + * @param fieldName field passed to analyzer to use when analyzing the content * @return the most interesting words in the document * @see #retrieveTerms(java.io.Reader) * @see #setMaxQueryTerms */ - public String[] retrieveInterestingTerms(Reader r) throws IOException { + public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException { ArrayList al = new ArrayList(maxQueryTerms); - PriorityQueue pq = retrieveTerms(r); + PriorityQueue pq = retrieveTerms(r, fieldName); Object cur; int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... // we just want to return the top words diff --git a/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisQuery.java b/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisQuery.java index c2efef7b872..d26efb5cc9f 100644 --- a/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisQuery.java +++ b/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThisQuery.java @@ -28,6 +28,7 @@ import org.apache.lucene.search.Query; import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.StringReader; import java.util.Set; /** @@ -40,6 +41,7 @@ public class MoreLikeThisQuery extends Query { private String likeText; private String[] moreLikeFields; private Analyzer analyzer; + private String fieldName; private float percentTermsToMatch = 0.3f; private int minTermFrequency = 1; private int maxQueryTerms = 5; @@ -49,10 +51,11 @@ public class MoreLikeThisQuery extends Query { /** * @param moreLikeFields */ - public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) { + public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer, String fieldName) { this.likeText = likeText; this.moreLikeFields = moreLikeFields; this.analyzer = analyzer; + this.fieldName = fieldName; } @Override @@ -67,7 +70,7 @@ public class MoreLikeThisQuery extends Query { } mlt.setMaxQueryTerms(maxQueryTerms); mlt.setStopWords(stopWords); - BooleanQuery bq = (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes())); + BooleanQuery bq = (BooleanQuery) mlt.like(new StringReader(likeText), fieldName); BooleanClause[] clauses = bq.getClauses(); //make at least half the terms match bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch)); diff --git a/modules/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/modules/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java index bb6fe6c5f17..38d9edadf8a 100644 --- a/modules/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java +++ b/modules/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java @@ -87,7 +87,7 @@ public class TestMoreLikeThis extends LuceneTestCase { mlt.setBoostFactor(boostFactor); BooleanQuery query = (BooleanQuery) mlt.like(new StringReader( - "lucene release")); + "lucene release"), "text"); List clauses = query.clauses(); assertEquals("Expected " + originalValues.size() + " clauses.", @@ -115,7 +115,7 @@ public class TestMoreLikeThis extends LuceneTestCase { mlt.setFieldNames(new String[] {"text"}); mlt.setBoost(true); BooleanQuery query = (BooleanQuery) mlt.like(new StringReader( - "lucene release")); + "lucene release"), "text"); List clauses = query.clauses(); for (BooleanClause clause : clauses) { @@ -124,4 +124,15 @@ public class TestMoreLikeThis extends LuceneTestCase { } return originalValues; } + + // LUCENE-3326 + public void testMultiFields() throws Exception { + MoreLikeThis mlt = new MoreLikeThis(reader); + mlt.setAnalyzer(new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)); + mlt.setMinDocFreq(1); + mlt.setMinTermFreq(1); + mlt.setMinWordLen(1); + mlt.setFieldNames(new String[] {"text", "foobar"}); + mlt.like(new StringReader("this is a test"), "foobar"); + } } diff --git a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java index 0f7598276b3..155d76f3bb9 100644 --- a/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/MoreLikeThisHandler.java @@ -367,7 +367,8 @@ public class MoreLikeThisHandler extends RequestHandlerBase public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List filters, List terms, int flags ) throws IOException { - rawMLTQuery = mlt.like(reader); + // analyzing with the first field: previous (stupid) behavior + rawMLTQuery = mlt.like(reader, mlt.getFieldNames()[0]); boostedMLTQuery = getBoostedQuery( rawMLTQuery ); if( terms != null ) { fillInterestingTermsFromMLTQuery( boostedMLTQuery, terms );