diff --git a/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java b/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java
index ef4ba65e9d0..32110220fcb 100644
--- a/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java
+++ b/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java
@@ -21,7 +21,6 @@ package org.elasticsearch.common.lucene.search;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.queries.mlt.MoreLikeThis;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.Query;
@@ -31,6 +30,7 @@ import org.apache.lucene.search.similarities.TFIDFSimilarity;
 import org.elasticsearch.common.io.FastStringReader;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.util.Arrays;
 import java.util.Set;
 
@@ -43,18 +43,18 @@ public class MoreLikeThisQuery extends Query {
 
     private TFIDFSimilarity similarity;
 
-    private String likeText;
+    private String[] likeText;
     private String[] moreLikeFields;
     private Analyzer analyzer;
     private float percentTermsToMatch = DEFAULT_PERCENT_TERMS_TO_MATCH;
-    private int minTermFrequency = MoreLikeThis.DEFAULT_MIN_TERM_FREQ;
-    private int maxQueryTerms = MoreLikeThis.DEFAULT_MAX_QUERY_TERMS;
-    private Set<?> stopWords = MoreLikeThis.DEFAULT_STOP_WORDS;
-    private int minDocFreq = MoreLikeThis.DEFAULT_MIN_DOC_FREQ;
-    private int maxDocFreq = MoreLikeThis.DEFAULT_MAX_DOC_FREQ;
-    private int minWordLen = MoreLikeThis.DEFAULT_MIN_WORD_LENGTH;
-    private int maxWordLen = MoreLikeThis.DEFAULT_MAX_WORD_LENGTH;
-    private boolean boostTerms = MoreLikeThis.DEFAULT_BOOST;
+    private int minTermFrequency = XMoreLikeThis.DEFAULT_MIN_TERM_FREQ;
+    private int maxQueryTerms = XMoreLikeThis.DEFAULT_MAX_QUERY_TERMS;
+    private Set<?> stopWords = XMoreLikeThis.DEFAULT_STOP_WORDS;
+    private int minDocFreq = XMoreLikeThis.DEFAULT_MIN_DOC_FREQ;
+    private int maxDocFreq = XMoreLikeThis.DEFAULT_MAX_DOC_FREQ;
+    private int minWordLen = XMoreLikeThis.DEFAULT_MIN_WORD_LENGTH;
+    private int maxWordLen = XMoreLikeThis.DEFAULT_MAX_WORD_LENGTH;
+    private boolean boostTerms = XMoreLikeThis.DEFAULT_BOOST;
     private float boostTermsFactor = 1;
 
@@ -63,7 +63,7 @@ public class MoreLikeThisQuery extends Query {
     }
 
     public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) {
-        this.likeText = likeText;
+        this.likeText = new String[]{likeText};
         this.moreLikeFields = moreLikeFields;
         this.analyzer = analyzer;
     }
 
@@ -72,7 +72,7 @@ public class MoreLikeThisQuery extends Query {
     public int hashCode() {
         int result = boostTerms ? 1 : 0;
         result = 31 * result + Float.floatToIntBits(boostTermsFactor);
-        result = 31 * result + likeText.hashCode();
+        result = 31 * result + Arrays.hashCode(likeText);
         result = 31 * result + maxDocFreq;
         result = 31 * result + maxQueryTerms;
         result = 31 * result + maxWordLen;
@@ -99,7 +99,7 @@ public class MoreLikeThisQuery extends Query {
             return false;
         if (boostTermsFactor != other.boostTermsFactor)
             return false;
-        if (!likeText.equals(other.likeText))
+        if (!(Arrays.equals(likeText, other.likeText)))
             return false;
         if (maxDocFreq != other.maxDocFreq)
             return false;
@@ -132,7 +132,7 @@ public class MoreLikeThisQuery extends Query {
 
     @Override
     public Query rewrite(IndexReader reader) throws IOException {
-        MoreLikeThis mlt = new MoreLikeThis(reader, similarity == null ? new DefaultSimilarity() : similarity);
+        XMoreLikeThis mlt = new XMoreLikeThis(reader, similarity == null ? new DefaultSimilarity() : similarity);
 
         mlt.setFieldNames(moreLikeFields);
         mlt.setAnalyzer(analyzer);
@@ -145,10 +145,15 @@ public class MoreLikeThisQuery extends Query {
         mlt.setStopWords(stopWords);
         mlt.setBoost(boostTerms);
         mlt.setBoostFactor(boostTermsFactor);
-        //LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field)
-        BooleanQuery bq = (BooleanQuery) mlt.like(new FastStringReader(likeText), moreLikeFields[0]);
-        BooleanClause[] clauses = bq.getClauses();
+        Reader[] readers = new Reader[likeText.length];
+        for (int i = 0; i < readers.length; i++) {
+            readers[i] = new FastStringReader(likeText[i]);
+        }
+        //LUCENE 4 UPGRADE this maps the 3.6 behavior (only use the first field)
+        BooleanQuery bq = (BooleanQuery) mlt.like(moreLikeFields[0], readers);
+
+        BooleanClause[] clauses = bq.getClauses();
 
         bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
         bq.setBoost(getBoost());
@@ -157,14 +162,22 @@ public class MoreLikeThisQuery extends Query {
 
     @Override
     public String toString(String field) {
-        return "like:" + likeText;
+        return "like:" + Arrays.toString(likeText);
     }
 
     public String getLikeText() {
+        return (likeText == null ? null : likeText[0]);
+    }
+
+    public String[] getLikeTexts() {
         return likeText;
     }
 
     public void setLikeText(String likeText) {
+        this.likeText = new String[]{likeText};
+    }
+
+    public void setLikeText(String... likeText) {
         this.likeText = likeText;
     }
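For reference, a minimal sketch (not part of the patch) of how the reworked query can now be driven with several texts; the analyzer, index reader, and field name here are illustrative placeholders:

    // Sketch only, assuming an open IndexReader and any Analyzer instance.
    MoreLikeThisQuery mlt = new MoreLikeThisQuery();
    mlt.setMoreLikeFields(new String[]{"body"});    // hypothetical field name
    mlt.setAnalyzer(analyzer);
    mlt.setLikeText("first value", "second value"); // new varargs overload added above
    Query rewritten = mlt.rewrite(reader);          // each value becomes its own Reader
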
diff --git a/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java b/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java
new file mode 100644
index 00000000000..f2314afe3b5
--- /dev/null
+++ b/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java
@@ -0,0 +1,964 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Copyright 2004-2005 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.elasticsearch.common.lucene.search;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.*;
+import org.apache.lucene.search.similarities.DefaultSimilarity;
+import org.apache.lucene.search.similarities.TFIDFSimilarity;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.lucene.util.UnicodeUtil;
+import org.elasticsearch.Version;
+import org.elasticsearch.common.io.FastStringReader;
+
+/**
+ * Generate "more like this" similarity queries.
+ * Based on this mail:
+ * <pre><code>
+ * Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
+ * Term frequencies can be computed by re-tokenizing the text, which, for a single document,
+ * is usually fast enough.  But looking up the docFreq() of every term in the document is
+ * probably too slow.
+ *
+ * You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
+ * or at all. Since you're trying to maximize a tf*idf score, you're probably most interested
+ * in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
+ * reduce the number of terms under consideration. Another heuristic is that terms with a
+ * high idf (i.e., a low df) tend to be longer. So you could threshold the terms by the
+ * number of characters, not selecting anything less than, e.g., six or seven characters.
+ * With these sorts of heuristics you can usually find a small set of, e.g., ten or fewer terms
+ * that do a pretty good job of characterizing a document.
+ *
+ * It all depends on what you're trying to do. If you're trying to eke out that last percent
+ * of precision and recall regardless of computational difficulty so that you can win a TREC
+ * competition, then the techniques I mention above are useless. But if you're trying to
+ * provide a "more like this" button on a search results page that does a decent job and has
+ * good performance, such techniques might be useful.
+ *
+ * An efficient, effective "more-like-this" query generator would be a great contribution, if
+ * anyone's interested. I'd imagine that it would take a Reader or a String (the document's
+ * text), analyzer Analyzer, and return a set of representative terms using heuristics like those
+ * above. The frequency and length thresholds could be parameters, etc.
+ *
+ * Doug
+ * </code></pre>
+ *
+ * <h3>Initial Usage</h3>
+ *
+ * This class has lots of options to try to make it efficient and flexible.
+ * The simplest possible usage is as follows. The bold
+ * fragment is specific to this class.
+ *
+ * <pre class="prettyprint">
+ * IndexReader ir = ...
+ * IndexSearcher is = ...
+ *
+ * MoreLikeThis mlt = new MoreLikeThis(ir);
+ * Reader target = ... // orig source of doc you want to find similarities to
+ * Query query = mlt.like(target);
+ *
+ * Hits hits = is.search(query);
+ * // now the usual iteration thru 'hits' - the only thing to watch for is to make sure
+ * // you ignore the doc if it matches your 'target' document, as it should be similar to itself
+ * </pre>
+ *
+ * Thus you:
+ * <ol>
+ * <li> do your normal, Lucene setup for searching,
+ * <li> create a MoreLikeThis,
+ * <li> get the text of the doc you want to find similarities to,
+ * <li> then call one of the like() calls to generate a similarity query,
+ * <li> call the searcher to find the similar docs.
+ * </ol>
+ *
+ * <h3>More Advanced Usage</h3>
+ *
+ * You may want to use {@link #setFieldNames setFieldNames(...)} so you can examine
+ * multiple fields (e.g. body and title) for similarity.
+ *
+ * Depending on the size of your index and the size and makeup of your documents you
+ * may want to call the other set methods to control how the similarity queries are
+ * generated:
+ *
+ * <pre>
+ * Changes: Mark Harwood 29/02/04
+ * Some bugfixing, some refactoring, some optimisation.
+ * - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
+ * - bugfix: No significant terms being created for fields with a termvector - because
+ * was only counting one occurrence per term/field pair in calculations(ie not including frequency info from TermVector)
+ * - refactor: moved common code into isNoiseWord()
+ * - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
+ * </pre>
+ */
+
+public final class XMoreLikeThis {
+
+    static {
+        assert Version.CURRENT.luceneVersion == org.apache.lucene.util.Version.LUCENE_48 : "Remove this class once we upgrade to Lucene 4.9";
+    }
+
+    /**
+     * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
+     *
+     * @see #getMaxNumTokensParsed
+     */
+    public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
+
+    /**
+     * Ignore terms with less than this frequency in the source doc.
+     *
+     * @see #getMinTermFreq
+     * @see #setMinTermFreq
+     */
+    public static final int DEFAULT_MIN_TERM_FREQ = 2;
+
+    /**
+     * Ignore words which do not occur in at least this many docs.
+     *
+     * @see #getMinDocFreq
+     * @see #setMinDocFreq
+     */
+    public static final int DEFAULT_MIN_DOC_FREQ = 5;
+
+    /**
+     * Ignore words which occur in more than this many docs.
+     *
+     * @see #getMaxDocFreq
+     * @see #setMaxDocFreq
+     * @see #setMaxDocFreqPct
+     */
+    public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;
+
+    /**
+     * Boost terms in query based on score.
+     *
+     * @see #isBoost
+     * @see #setBoost
+     */
+    public static final boolean DEFAULT_BOOST = false;
+
+    /**
+     * Default field names. Null is used to specify that the field names should be looked
+     * up at runtime from the provided reader.
+     */
+    public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"};
+
+    /**
+     * Ignore words less than this length or if 0 then this has no effect.
+     *
+     * @see #getMinWordLen
+     * @see #setMinWordLen
+     */
+    public static final int DEFAULT_MIN_WORD_LENGTH = 0;
+
+    /**
+     * Ignore words greater than this length or if 0 then this has no effect.
+     *
+     * @see #getMaxWordLen
+     * @see #setMaxWordLen
+     */
+    public static final int DEFAULT_MAX_WORD_LENGTH = 0;
+
+    /**
+     * Default set of stopwords.
+     * If null means to allow stop words.
+     *
+     * @see #setStopWords
+     * @see #getStopWords
+     */
+    public static final Set<?> DEFAULT_STOP_WORDS = null;
+
+    /**
+     * Current set of stop words.
+     */
+    private Set<?> stopWords = DEFAULT_STOP_WORDS;
+
+    /**
+     * Return a Query with no more than this many terms.
+     *
+     * @see BooleanQuery#getMaxClauseCount
+     * @see #getMaxQueryTerms
+     * @see #setMaxQueryTerms
+     */
+    public static final int DEFAULT_MAX_QUERY_TERMS = 25;
+
+    /**
+     * Analyzer that will be used to parse the doc.
+     */
+    private Analyzer analyzer = null;
+
+    /**
+     * Ignore words less frequent than this.
+     */
+    private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
+
+    /**
+     * Ignore words which do not occur in at least this many docs.
+     */
+    private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
+
+    /**
+     * Ignore words which occur in more than this many docs.
+     */
+    private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
+
+    /**
+     * Should we apply a boost to the Query based on the scores?
+     */
+    private boolean boost = DEFAULT_BOOST;
+
+    /**
+     * Field name we'll analyze.
+     */
+    private String[] fieldNames = DEFAULT_FIELD_NAMES;
+
+    /**
+     * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
+     */
+    private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
+
+    /**
+     * Ignore words if less than this len.
+     */
+    private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
+
+    /**
+     * Ignore words if greater than this len.
+     */
+    private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
+
+    /**
+     * Don't return a query longer than this.
+     */
+    private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
+
+    /**
+     * For idf() calculations.
+     */
+    private TFIDFSimilarity similarity;// = new DefaultSimilarity();
+
+    /**
+     * IndexReader to use
+     */
+    private final IndexReader ir;
+
+    /**
+     * Boost factor to use when boosting the terms
+     */
+    private float boostFactor = 1;
+
+    /**
+     * Returns the boost factor used when boosting terms
+     *
+     * @return the boost factor used when boosting terms
+     * @see #setBoostFactor(float)
+     */
+    public float getBoostFactor() {
+        return boostFactor;
+    }
+
+    /**
+     * Sets the boost factor to use when boosting terms
+     *
+     * @see #getBoostFactor()
+     */
+    public void setBoostFactor(float boostFactor) {
+        this.boostFactor = boostFactor;
+    }
+
+    /**
+     * Constructor requiring an IndexReader.
+     */
+    public XMoreLikeThis(IndexReader ir) {
+        this(ir, new DefaultSimilarity());
+    }
+
+    public XMoreLikeThis(IndexReader ir, TFIDFSimilarity sim) {
+        this.ir = ir;
+        this.similarity = sim;
+    }
+
+    public TFIDFSimilarity getSimilarity() {
+        return similarity;
+    }
+
+    public void setSimilarity(TFIDFSimilarity similarity) {
+        this.similarity = similarity;
+    }
+
+    /**
+     * Returns an analyzer that will be used to parse source doc with. The default analyzer
+     * is not set.
+     *
+     * @return the analyzer that will be used to parse source doc with.
+     */
+    public Analyzer getAnalyzer() {
+        return analyzer;
+    }
+
+    /**
+     * Sets the analyzer to use. An analyzer is not required for generating a query with the
+     * {@link #like(int)} method, all other 'like' methods require an analyzer.
+     *
+     * @param analyzer the analyzer to use to tokenize text.
+     */
+    public void setAnalyzer(Analyzer analyzer) {
+        this.analyzer = analyzer;
+    }
+
+    /**
+     * Returns the frequency below which terms will be ignored in the source doc. The default
+     * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}.
+     *
+     * @return the frequency below which terms will be ignored in the source doc.
+     */
+    public int getMinTermFreq() {
+        return minTermFreq;
+    }
+
+    /**
+     * Sets the frequency below which terms will be ignored in the source doc.
+     *
+     * @param minTermFreq the frequency below which terms will be ignored in the source doc.
+     */
+    public void setMinTermFreq(int minTermFreq) {
+        this.minTermFreq = minTermFreq;
+    }
+
+    /**
+     * Returns the frequency at which words will be ignored which do not occur in at least this
+     * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}.
+     *
+     * @return the frequency at which words will be ignored which do not occur in at least this
+     *         many docs.
+     */
+    public int getMinDocFreq() {
+        return minDocFreq;
+    }
+
+    /**
+     * Sets the frequency at which words will be ignored which do not occur in at least this
+     * many docs.
+     *
+     * @param minDocFreq the frequency at which words will be ignored which do not occur in at
+     *                   least this many docs.
+     */
+    public void setMinDocFreq(int minDocFreq) {
+        this.minDocFreq = minDocFreq;
+    }
+
+    /**
+     * Returns the maximum frequency in which words may still appear.
+     * Words that appear in more than this many docs will be ignored. The default frequency is
+     * {@link #DEFAULT_MAX_DOC_FREQ}.
+     *
+     * @return get the maximum frequency at which words are still allowed,
+     *         words which occur in more docs than this are ignored.
+     */
+    public int getMaxDocFreq() {
+        return maxDocFreq;
+    }
+
+    /**
+     * Set the maximum frequency in which words may still appear. Words that appear
+     * in more than this many docs will be ignored.
+     *
+     * @param maxFreq the maximum count of documents that a term may appear
+     *                in to be still considered relevant
+     */
+    public void setMaxDocFreq(int maxFreq) {
+        this.maxDocFreq = maxFreq;
+    }
+
+    /**
+     * Set the maximum percentage in which words may still appear. Words that appear
+     * in more than this many percent of all docs will be ignored.
+     *
+     * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear
+     *                      in to be still considered relevant
+     */
+    public void setMaxDocFreqPct(int maxPercentage) {
+        this.maxDocFreq = maxPercentage * ir.numDocs() / 100;
+    }
+
+    /**
+     * Returns whether to boost terms in query based on "score" or not. The default is
+     * {@link #DEFAULT_BOOST}.
+     *
+     * @return whether to boost terms in query based on "score" or not.
+     * @see #setBoost
+     */
+    public boolean isBoost() {
+        return boost;
+    }
+
+    /**
+     * Sets whether to boost terms in query based on "score" or not.
+     *
+     * @param boost true to boost terms in query based on "score", false otherwise.
+     * @see #isBoost
+     */
+    public void setBoost(boolean boost) {
+        this.boost = boost;
+    }
+
+    /**
+     * Returns the field names that will be used when generating the 'More Like This' query.
+     * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}.
+     *
+     * @return the field names that will be used when generating the 'More Like This' query.
+     */
+    public String[] getFieldNames() {
+        return fieldNames;
+    }
+
+    /**
+     * Sets the field names that will be used when generating the 'More Like This' query.
+     * Set this to null for the field names to be determined at runtime from the IndexReader
+     * provided in the constructor.
+     *
+     * @param fieldNames the field names that will be used when generating the 'More Like This'
+     *                   query.
+     */
+    public void setFieldNames(String[] fieldNames) {
+        this.fieldNames = fieldNames;
+    }
+
+    /**
+     * Returns the minimum word length below which words will be ignored. Set this to 0 for no
+     * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.
+     *
+     * @return the minimum word length below which words will be ignored.
+     */
+    public int getMinWordLen() {
+        return minWordLen;
+    }
+
+    /**
+     * Sets the minimum word length below which words will be ignored.
+     *
+     * @param minWordLen the minimum word length below which words will be ignored.
+     */
+    public void setMinWordLen(int minWordLen) {
+        this.minWordLen = minWordLen;
+    }
+
+    /**
+     * Returns the maximum word length above which words will be ignored. Set this to 0 for no
+     * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}.
+     *
+     * @return the maximum word length above which words will be ignored.
+     */
+    public int getMaxWordLen() {
+        return maxWordLen;
+    }
+
+    /**
+     * Sets the maximum word length above which words will be ignored.
+     *
+     * @param maxWordLen the maximum word length above which words will be ignored.
+     */
+    public void setMaxWordLen(int maxWordLen) {
+        this.maxWordLen = maxWordLen;
+    }
+
+    /**
+     * Set the set of stopwords.
+     * Any word in this set is considered "uninteresting" and ignored.
+     * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
+     * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
+     *
+     * @param stopWords set of stopwords, if null it means to allow stop words
+     * @see #getStopWords
+     */
+    public void setStopWords(Set<?> stopWords) {
+        this.stopWords = stopWords;
+    }
+
+    /**
+     * Get the current stop words being used.
+     *
+     * @see #setStopWords
+     */
+    public Set<?> getStopWords() {
+        return stopWords;
+    }
+
+    /**
+     * Returns the maximum number of query terms that will be included in any generated query.
+     * The default is {@link #DEFAULT_MAX_QUERY_TERMS}.
+     *
+     * @return the maximum number of query terms that will be included in any generated query.
+     */
+    public int getMaxQueryTerms() {
+        return maxQueryTerms;
+    }
+
+    /**
+     * Sets the maximum number of query terms that will be included in any generated query.
+     *
+     * @param maxQueryTerms the maximum number of query terms that will be included in any
+     *                      generated query.
+     */
+    public void setMaxQueryTerms(int maxQueryTerms) {
+        this.maxQueryTerms = maxQueryTerms;
+    }
+
+    /**
+     * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
+     * @see #DEFAULT_MAX_NUM_TOKENS_PARSED
+     */
+    public int getMaxNumTokensParsed() {
+        return maxNumTokensParsed;
+    }
+
+    /**
+     * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
+     */
+    public void setMaxNumTokensParsed(int i) {
+        maxNumTokensParsed = i;
+    }
+
+    /**
+     * Return a query that will return docs like the passed lucene document ID.
+     *
+     * @param docNum the documentID of the lucene doc to generate the 'More Like This" query for.
+     * @return a query that will return docs like the passed lucene document ID.
+     */
+    public Query like(int docNum) throws IOException {
+        if (fieldNames == null) {
+            // gather list of valid fields from lucene
+            Collection<String> fields = MultiFields.getIndexedFields(ir);
+            fieldNames = fields.toArray(new String[fields.size()]);
+        }
+
+        return createQuery(retrieveTerms(docNum));
+    }
+
+    /**
+     * Return a query that will return docs like the passed Reader.
+     *
+     * @return a query that will return docs like the passed Reader.
+     */
+    @Deprecated
+    public Query like(Reader r, String fieldName) throws IOException {
+        return like(fieldName, r);
+    }
+
+    /**
+     * Return a query that will return docs like the passed Readers.
+     * This was added in order to treat multi-value fields.
+     *
+     * @return a query that will return docs like the passed Readers.
+     */
+    public Query like(String fieldName, Reader... readers) throws IOException {
+        Map<String, Int> words = new HashMap<>();
+        for (Reader r : readers) {
+            addTermFrequencies(r, words, fieldName);
+        }
+        return createQuery(createQueue(words));
+    }
+
+    /**
+     * Create the More like query from a PriorityQueue
+     */
+    private Query createQuery(PriorityQueue<Object[]> q) {
+        BooleanQuery query = new BooleanQuery();
+        Object cur;
+        int qterms = 0;
+        float bestScore = 0;
+
+        while ((cur = q.pop()) != null) {
+            Object[] ar = (Object[]) cur;
+            TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
+
+            if (boost) {
+                if (qterms == 0) {
+                    bestScore = ((Float) ar[2]);
+                }
+                float myScore = ((Float) ar[2]);
+
+                tq.setBoost(boostFactor * myScore / bestScore);
+            }
+
+            try {
+                query.add(tq, BooleanClause.Occur.SHOULD);
+            } catch (BooleanQuery.TooManyClauses ignore) {
+                break;
+            }
+
+            qterms++;
+            if (maxQueryTerms > 0 && qterms >= maxQueryTerms) {
+                break;
+            }
+        }
+
+        return query;
+    }
+
+    /**
+     * Create a PriorityQueue from a word->tf map.
+     *
+     * @param words a map of words keyed on the word(String) with Int objects as the values.
+     */
+    private PriorityQueue<Object[]> createQueue(Map<String, Int> words) throws IOException {
+        // have collected all words in doc and their freqs
+        int numDocs = ir.numDocs();
+        FreqQ res = new FreqQ(words.size()); // will order words by score
+
+        for (String word : words.keySet()) { // for every word
+            int tf = words.get(word).x; // term freq in the source doc
+            if (minTermFreq > 0 && tf < minTermFreq) {
+                continue; // filter out words that don't occur enough times in the source
+            }
+
+            // go through all the fields and find the largest document frequency
+            String topField = fieldNames[0];
+            int docFreq = 0;
+            for (String fieldName : fieldNames) {
+                int freq = ir.docFreq(new Term(fieldName, word));
+                topField = (freq > docFreq) ? fieldName : topField;
+                docFreq = (freq > docFreq) ? freq : docFreq;
+            }
+
+            if (minDocFreq > 0 && docFreq < minDocFreq) {
+                continue; // filter out words that don't occur in enough docs
+            }
+
+            if (docFreq > maxDocFreq) {
+                continue; // filter out words that occur in too many docs
+            }
+
+            if (docFreq == 0) {
+                continue; // index update problem?
+            }
+
+            float idf = similarity.idf(docFreq, numDocs);
+            float score = tf * idf;
+
+            // only really need 1st 3 entries, other ones are for troubleshooting
+            res.insertWithOverflow(new Object[]{word, // the word
+                    topField, // the top field
+                    score, // overall score
+                    idf, // idf
+                    docFreq, // freq in all docs
+                    tf
+            });
+        }
+        return res;
+    }
+
+    /**
+     * Describe the parameters that control how the "more like this" query is formed.
+     */
+    public String describeParams() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("\t").append("maxQueryTerms : ").append(maxQueryTerms).append("\n");
+        sb.append("\t").append("minWordLen : ").append(minWordLen).append("\n");
+        sb.append("\t").append("maxWordLen : ").append(maxWordLen).append("\n");
+        sb.append("\t").append("fieldNames : ");
+        String delim = "";
+        for (String fieldName : fieldNames) {
+            sb.append(delim).append(fieldName);
+            delim = ", ";
+        }
+        sb.append("\n");
+        sb.append("\t").append("boost : ").append(boost).append("\n");
+        sb.append("\t").append("minTermFreq : ").append(minTermFreq).append("\n");
+        sb.append("\t").append("minDocFreq : ").append(minDocFreq).append("\n");
+        return sb.toString();
+    }
+
+    /**
+     * Find words for a more-like-this query former.
+     *
+     * @param docNum the id of the lucene document from which to find terms
+     */
+    public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
+        Map<String, Int> termFreqMap = new HashMap<>();
+        for (String fieldName : fieldNames) {
+            final Fields vectors = ir.getTermVectors(docNum);
+            final Terms vector;
+            if (vectors != null) {
+                vector = vectors.terms(fieldName);
+            } else {
+                vector = null;
+            }
+
+            // field does not store term vector info
+            if (vector == null) {
+                Document d = ir.document(docNum);
+                IndexableField fields[] = d.getFields(fieldName);
+                for (IndexableField field : fields) {
+                    final String stringValue = field.stringValue();
+                    if (stringValue != null) {
+                        addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName);
+                    }
+                }
+            } else {
+                addTermFrequencies(termFreqMap, vector);
+            }
+        }
+
+        return createQueue(termFreqMap);
+    }
+
+    /**
+     * Adds terms and frequencies found in vector into the Map termFreqMap
+     *
+     * @param termFreqMap a Map of terms and their frequencies
+     * @param vector      List of terms and their frequencies for a doc/field
+     */
+    private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
+        final TermsEnum termsEnum = vector.iterator(null);
+        final CharsRef spare = new CharsRef();
+        BytesRef text;
+        while ((text = termsEnum.next()) != null) {
+            UnicodeUtil.UTF8toUTF16(text, spare);
+            final String term = spare.toString();
+            if (isNoiseWord(term)) {
+                continue;
+            }
+            final int freq = (int) termsEnum.totalTermFreq();
+
+            // increment frequency
+            Int cnt = termFreqMap.get(term);
+            if (cnt == null) {
+                cnt = new Int();
+                termFreqMap.put(term, cnt);
+                cnt.x = freq;
+            } else {
+                cnt.x += freq;
+            }
+        }
+    }
+
+    /**
+     * Adds term frequencies found by tokenizing text from reader into the Map words
+     *
+     * @param r           a source of text to be tokenized
+     * @param termFreqMap a Map of terms and their frequencies
+     * @param fieldName   Used by analyzer for any special per-field analysis
+     */
+    private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
+            throws IOException {
+        if (analyzer == null) {
+            throw new UnsupportedOperationException("To use MoreLikeThis without " +
+                    "term vectors, you must provide an Analyzer");
+        }
+        TokenStream ts = analyzer.tokenStream(fieldName, r);
+        try {
+            int tokenCount = 0;
+            // for every token
+            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+            ts.reset();
+            while (ts.incrementToken()) {
+                String word = termAtt.toString();
+                tokenCount++;
+                if (tokenCount > maxNumTokensParsed) {
+                    break;
+                }
+                if (isNoiseWord(word)) {
+                    continue;
+                }
+
+                // increment frequency
+                Int cnt = termFreqMap.get(word);
+                if (cnt == null) {
+                    termFreqMap.put(word, new Int());
+                } else {
+                    cnt.x++;
+                }
+            }
+            ts.end();
+        } finally {
+            IOUtils.closeWhileHandlingException(ts);
+        }
+    }
+
+    /**
+     * determines if the passed term is likely to be of interest in "more like" comparisons
+     *
+     * @param term The word being considered
+     * @return true if should be ignored, false if should be used in further analysis
+     */
+    private boolean isNoiseWord(String term) {
+        int len = term.length();
+        if (minWordLen > 0 && len < minWordLen) {
+            return true;
+        }
+        if (maxWordLen > 0 && len > maxWordLen) {
+            return true;
+        }
+        return stopWords != null && stopWords.contains(term);
+    }
+
+    /**
+     * Find words for a more-like-this query former.
+     * The result is a priority queue of arrays with one entry for every word in the document.
+     * Each array has 6 elements.
+     * The elements are:
+     * <ol>
+     * <li> The word (String)
+     * <li> The top field that this word comes from (String)
+     * <li> The score for this word (Float)
+     * <li> The IDF value (Float)
+     * <li> The frequency of this word in the index (Integer)
+     * <li> The frequency of this word in the source document (Integer)
+     * </ol>
+     * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
+     * This method is exposed so that you can identify the "interesting words" in a document.
+     * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
+     *
+     * @param r         the reader that has the content of the document
+     * @param fieldName field passed to the analyzer to use when analyzing the content
+     * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
+     * @see #retrieveInterestingTerms
+     */
+    public PriorityQueue<Object[]> retrieveTerms(Reader r, String fieldName) throws IOException {
+        Map<String, Int> words = new HashMap<>();
+        addTermFrequencies(r, words, fieldName);
+        return createQueue(words);
+    }
+
+    /**
+     * @see #retrieveInterestingTerms(java.io.Reader, String)
+     */
+    public String[] retrieveInterestingTerms(int docNum) throws IOException {
+        ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
+        PriorityQueue<Object[]> pq = retrieveTerms(docNum);
+        Object cur;
+        int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
+        // we just want to return the top words
+        while (((cur = pq.pop()) != null) && lim-- > 0) {
+            Object[] ar = (Object[]) cur;
+            al.add(ar[0]); // the 1st entry is the interesting word
+        }
+        String[] res = new String[al.size()];
+        return al.toArray(res);
+    }
+
+    /**
+     * Convenience routine to make it easy to return the most interesting words in a document.
+     * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly.
+     *
+     * @param r         the source document
+     * @param fieldName field passed to analyzer to use when analyzing the content
+     * @return the most interesting words in the document
+     * @see #retrieveTerms(java.io.Reader, String)
+     * @see #setMaxQueryTerms
+     */
+    public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException {
+        ArrayList<Object> al = new ArrayList<>(maxQueryTerms);
+        PriorityQueue<Object[]> pq = retrieveTerms(r, fieldName);
+        Object cur;
+        int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
+        // we just want to return the top words
+        while (((cur = pq.pop()) != null) && lim-- > 0) {
+            Object[] ar = (Object[]) cur;
+            al.add(ar[0]); // the 1st entry is the interesting word
+        }
+        String[] res = new String[al.size()];
+        return al.toArray(res);
+    }
+
+    /**
+     * PriorityQueue that orders words by score.
+     */
+    private static class FreqQ extends PriorityQueue<Object[]> {
+        FreqQ(int s) {
+            super(s);
+        }
+
+        @Override
+        protected boolean lessThan(Object[] aa, Object[] bb) {
+            Float fa = (Float) aa[2];
+            Float fb = (Float) bb[2];
+            return fa > fb;
+        }
+    }
+
+    /**
+     * Use for frequencies and to avoid renewing Integers.
+     */
+    private static class Int {
+        int x;
+
+        Int() {
+            x = 1;
+        }
+    }
+}
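As a sketch of the new entry point (the analyzer and reader variables are assumed to exist), term frequencies from all readers of a field are accumulated into one map before the query is built, so a value split across several entries scores the same as the concatenated text:

    // Sketch only: one query from several values of the same field.
    XMoreLikeThis mlt = new XMoreLikeThis(indexReader);
    mlt.setAnalyzer(analyzer);
    mlt.setMinTermFreq(1);
    mlt.setMinDocFreq(1);
    Query query = mlt.like("text", new FastStringReader("lucene for search"),
            new FastStringReader("lucene for more like this"));
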
diff --git a/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java b/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java
index 914a1cf0c1b..1249de3b0c5 100644
--- a/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java
+++ b/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java
@@ -39,6 +39,7 @@ import org.elasticsearch.index.analysis.Analysis;
 import org.elasticsearch.index.mapper.Uid;
 import org.elasticsearch.index.mapper.internal.UidFieldMapper;
 import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService;
+import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService.LikeText;
 
 import java.io.IOException;
 import java.util.*;
@@ -205,11 +206,11 @@ public class MoreLikeThisQueryParser implements QueryParser {
             }
         }
         // fetching the items with multi-get
-        List<MoreLikeThisFetchService.LikeText> likeTexts = fetchService.fetch(items);
+        List<LikeText> likeTexts = fetchService.fetch(items);
         // right now we are just building a boolean query
         BooleanQuery boolQuery = new BooleanQuery();
-        for (MoreLikeThisFetchService.LikeText likeText : likeTexts) {
-            addMoreLikeThis(boolQuery, mltQuery, likeText.field, likeText.text);
+        for (LikeText likeText : likeTexts) {
+            addMoreLikeThis(boolQuery, mltQuery, likeText);
         }
         // exclude the items from the search
         if (!include) {
@@ -227,10 +228,10 @@ public class MoreLikeThisQueryParser implements QueryParser {
         return mltQuery;
     }
 
-    private void addMoreLikeThis(BooleanQuery boolQuery, MoreLikeThisQuery mltQuery, String fieldName, String likeText) {
+    private void addMoreLikeThis(BooleanQuery boolQuery, MoreLikeThisQuery mltQuery, LikeText likeText) {
         MoreLikeThisQuery mlt = new MoreLikeThisQuery();
-        mlt.setMoreLikeFields(new String[] {fieldName});
-        mlt.setLikeText(likeText);
+        mlt.setMoreLikeFields(new String[] {likeText.field});
+        mlt.setLikeText(likeText.text);
         mlt.setAnalyzer(mltQuery.getAnalyzer());
         mlt.setPercentTermsToMatch(mltQuery.getPercentTermsToMatch());
         mlt.setBoostTerms(mltQuery.isBoostTerms());
diff --git a/src/main/java/org/elasticsearch/index/search/morelikethis/MoreLikeThisFetchService.java b/src/main/java/org/elasticsearch/index/search/morelikethis/MoreLikeThisFetchService.java
index 3763c225d28..92a42412244 100644
--- a/src/main/java/org/elasticsearch/index/search/morelikethis/MoreLikeThisFetchService.java
+++ b/src/main/java/org/elasticsearch/index/search/morelikethis/MoreLikeThisFetchService.java
@@ -40,9 +40,14 @@ public class MoreLikeThisFetchService extends AbstractComponent {
 
     public static final class LikeText {
 
         public final String field;
-        public final String text;
+        public final String[] text;
 
         public LikeText(String field, String text) {
+            this.field = field;
+            this.text = new String[]{text};
+        }
+
+        public LikeText(String field, String... text) {
             this.field = field;
             this.text = text;
         }
@@ -73,9 +78,11 @@ public class MoreLikeThisFetchService extends AbstractComponent {
             }
 
             for (GetField getField : getResponse.getFields().values()) {
-                for (Object value : getField.getValues()) {
-                    likeTexts.add(new LikeText(getField.getName(), value.toString()));
+                String[] text = new String[getField.getValues().size()];
+                for (int i = 0; i < text.length; i++) {
+                    text[i] = getField.getValues().get(i).toString();
                 }
+                likeTexts.add(new LikeText(getField.getName(), text));
             }
         }
         return likeTexts;
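A hedged illustration of what the fetch change above produces; the GetField instance and its values are hypothetical:

    // Hypothetical getField whose values are ["a", "b"].
    String[] text = new String[getField.getValues().size()];
    for (int i = 0; i < text.length; i++) {
        text[i] = getField.getValues().get(i).toString();
    }
    // Before this patch: new LikeText("field", "a") and new LikeText("field", "b").
    // After: a single LikeText carrying both values of the field.
    LikeText likeText = new LikeText(getField.getName(), text);
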
diff --git a/src/test/java/org/elasticsearch/index/query/ItemSerializationTests.java b/src/test/java/org/elasticsearch/index/query/ItemSerializationTests.java
index 765ce4585ee..5bc1b284c78 100644
--- a/src/test/java/org/elasticsearch/index/query/ItemSerializationTests.java
+++ b/src/test/java/org/elasticsearch/index/query/ItemSerializationTests.java
@@ -41,22 +41,12 @@ import static org.hamcrest.Matchers.is;
 
 public class ItemSerializationTests extends ElasticsearchTestCase {
 
-    private String[] generateRandomStringArray(int arraySize, int stringSize) {
-        String[] array = randomBoolean() ? new String[randomInt(arraySize)] : null; // allow empty arrays
-        if (array != null) {
-            for (int i = 0; i < array.length; i++) {
-                array[i] = randomAsciiOfLength(stringSize);
-            }
-        }
-        return array;
-    }
-
     private Item generateRandomItem(int arraySize, int stringSize) {
         String index = randomAsciiOfLength(stringSize);
         String type = randomAsciiOfLength(stringSize);
         String id = String.valueOf(Math.abs(randomInt()));
         String routing = randomBoolean() ? randomAsciiOfLength(stringSize) : null;
-        String[] fields = generateRandomStringArray(arraySize, stringSize);
+        String[] fields = generateRandomStringArray(arraySize, stringSize, true);
         long version = Math.abs(randomLong());
         VersionType versionType = RandomPicks.randomFrom(new Random(), VersionType.values());
 
@@ -67,11 +57,11 @@ public class ItemSerializationTests extends ElasticsearchTestCase {
                 fetchSourceContext = new FetchSourceContext(randomBoolean());
                 break;
             case 1 :
-                fetchSourceContext = new FetchSourceContext(generateRandomStringArray(arraySize, stringSize));
+                fetchSourceContext = new FetchSourceContext(generateRandomStringArray(arraySize, stringSize, true));
                 break;
             case 2 :
-                fetchSourceContext = new FetchSourceContext(generateRandomStringArray(arraySize, stringSize),
-                        generateRandomStringArray(arraySize, stringSize));
+                fetchSourceContext = new FetchSourceContext(generateRandomStringArray(arraySize, stringSize, true),
+                        generateRandomStringArray(arraySize, stringSize, true));
                 break;
             default:
                 fetchSourceContext = null;
diff --git a/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java b/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java
index ee5bb483c64..7127b6c3b1d 100644
--- a/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java
+++ b/src/test/java/org/elasticsearch/index/query/SimpleIndexQueryParserTests.java
@@ -1701,10 +1701,11 @@ public class SimpleIndexQueryParserTests extends ElasticsearchTestCase {
         // check each clause is for each item
         BooleanClause[] boolClauses = booleanQuery.getClauses();
         for (int i=0; i<boolClauses.length; i++) {

+        List<IndexRequestBuilder> builders = new ArrayList<>(values.length + 1);
+        // index one document with all the values
+        builders.add(client().prepareIndex("test", "type1", "0").setSource("text", values));
+        // index each document with only one of the values
+        for (int i = 0; i < values.length; i++) {
+            builders.add(client().prepareIndex("test", "type1", String.valueOf(i + 1)).setSource("text", values[i]));
+        }
+        indexRandom(true, builders);
+
+        int maxIters = randomIntBetween(10, 20);
+        for (int i = 0; i < maxIters; i++) {
+            int max_query_terms = randomIntBetween(1, values.length);
+            logger.info("Running More Like This with max_query_terms = %s", max_query_terms);
+            MoreLikeThisQueryBuilder mltQuery = moreLikeThisQuery("text").ids("0").minTermFreq(1).minDocFreq(1)
+                    .maxQueryTerms(max_query_terms).percentTermsToMatch(0);
+            SearchResponse response = client().prepareSearch("test").setTypes("type1")
+                    .setQuery(mltQuery).execute().actionGet();
+            assertSearchResponse(response);
+            assertHitCount(response, max_query_terms);
+
+            logger.info("Running More Like This API with max_query_terms = %s returns all docs!", max_query_terms);
+            response = client().moreLikeThis(moreLikeThisRequest("test").type("type1")
+                    .id("0").fields("text").minTermFreq(1).minDocFreq(1)
+                    .maxQueryTerms(max_query_terms).percentTermsToMatch(0))
+                    .actionGet();
+            assertSearchResponse(response);
+            assertHitCount(response, values.length);
+        }
+    }
 }
diff --git a/src/test/java/org/elasticsearch/test/ElasticsearchTestCase.java b/src/test/java/org/elasticsearch/test/ElasticsearchTestCase.java
index 0c1fdbc94fe..cdf28192286 100644
--- a/src/test/java/org/elasticsearch/test/ElasticsearchTestCase.java
+++ b/src/test/java/org/elasticsearch/test/ElasticsearchTestCase.java
@@ -294,4 +294,19 @@ public abstract class ElasticsearchTestCase extends AbstractRandomizedTest {
     public static <T> T randomFrom(T... values) {
         return RandomizedTest.randomFrom(values);
     }
+
+    public static String[] generateRandomStringArray(int maxArraySize, int maxStringSize, boolean allowNull) {
+        if (allowNull && randomBoolean()) {
+            return null;
+        }
+        String[] array = new String[randomInt(maxArraySize)]; // allow empty arrays
+        for (int i = 0; i < array.length; i++) {
+            array[i] = randomAsciiOfLength(maxStringSize);
+        }
+        return array;
+    }
+
+    public static String[] generateRandomStringArray(int maxArraySize, int maxStringSize) {
+        return generateRandomStringArray(maxArraySize, maxStringSize, false);
+    }
 }
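For completeness, a small usage sketch of the relocated helper; the sizes are arbitrary:

    // Within a test extending ElasticsearchTestCase:
    String[] maybeNull = generateRandomStringArray(10, 8, true);  // null about half the time
    String[] nonNull   = generateRandomStringArray(10, 8);        // never null, may be empty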