diff --git a/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java b/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java
index ef4ba65e9d0..32110220fcb 100644
--- a/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java
+++ b/src/main/java/org/elasticsearch/common/lucene/search/MoreLikeThisQuery.java
@@ -21,7 +21,6 @@ package org.elasticsearch.common.lucene.search;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
@@ -31,6 +30,7 @@ import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.elasticsearch.common.io.FastStringReader;
import java.io.IOException;
+import java.io.Reader;
import java.util.Arrays;
import java.util.Set;
@@ -43,18 +43,18 @@ public class MoreLikeThisQuery extends Query {
private TFIDFSimilarity similarity;
- private String likeText;
+ private String[] likeText;
private String[] moreLikeFields;
private Analyzer analyzer;
private float percentTermsToMatch = DEFAULT_PERCENT_TERMS_TO_MATCH;
- private int minTermFrequency = MoreLikeThis.DEFAULT_MIN_TERM_FREQ;
- private int maxQueryTerms = MoreLikeThis.DEFAULT_MAX_QUERY_TERMS;
- private Set<?> stopWords = MoreLikeThis.DEFAULT_STOP_WORDS;
- private int minDocFreq = MoreLikeThis.DEFAULT_MIN_DOC_FREQ;
- private int maxDocFreq = MoreLikeThis.DEFAULT_MAX_DOC_FREQ;
- private int minWordLen = MoreLikeThis.DEFAULT_MIN_WORD_LENGTH;
- private int maxWordLen = MoreLikeThis.DEFAULT_MAX_WORD_LENGTH;
- private boolean boostTerms = MoreLikeThis.DEFAULT_BOOST;
+ private int minTermFrequency = XMoreLikeThis.DEFAULT_MIN_TERM_FREQ;
+ private int maxQueryTerms = XMoreLikeThis.DEFAULT_MAX_QUERY_TERMS;
+ private Set<?> stopWords = XMoreLikeThis.DEFAULT_STOP_WORDS;
+ private int minDocFreq = XMoreLikeThis.DEFAULT_MIN_DOC_FREQ;
+ private int maxDocFreq = XMoreLikeThis.DEFAULT_MAX_DOC_FREQ;
+ private int minWordLen = XMoreLikeThis.DEFAULT_MIN_WORD_LENGTH;
+ private int maxWordLen = XMoreLikeThis.DEFAULT_MAX_WORD_LENGTH;
+ private boolean boostTerms = XMoreLikeThis.DEFAULT_BOOST;
private float boostTermsFactor = 1;
@@ -63,7 +63,7 @@ public class MoreLikeThisQuery extends Query {
}
public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) {
- this.likeText = likeText;
+ this.likeText = new String[]{likeText};
this.moreLikeFields = moreLikeFields;
this.analyzer = analyzer;
}
@@ -72,7 +72,7 @@ public class MoreLikeThisQuery extends Query {
public int hashCode() {
int result = boostTerms ? 1 : 0;
result = 31 * result + Float.floatToIntBits(boostTermsFactor);
- result = 31 * result + likeText.hashCode();
+ result = 31 * result + Arrays.hashCode(likeText);
result = 31 * result + maxDocFreq;
result = 31 * result + maxQueryTerms;
result = 31 * result + maxWordLen;
@@ -99,7 +99,7 @@ public class MoreLikeThisQuery extends Query {
return false;
if (boostTermsFactor != other.boostTermsFactor)
return false;
- if (!likeText.equals(other.likeText))
+ if (!(Arrays.equals(likeText, other.likeText)))
return false;
if (maxDocFreq != other.maxDocFreq)
return false;
@@ -132,7 +132,7 @@ public class MoreLikeThisQuery extends Query {
@Override
public Query rewrite(IndexReader reader) throws IOException {
- MoreLikeThis mlt = new MoreLikeThis(reader, similarity == null ? new DefaultSimilarity() : similarity);
+ XMoreLikeThis mlt = new XMoreLikeThis(reader, similarity == null ? new DefaultSimilarity() : similarity);
mlt.setFieldNames(moreLikeFields);
mlt.setAnalyzer(analyzer);
@@ -145,10 +145,15 @@ public class MoreLikeThisQuery extends Query {
mlt.setStopWords(stopWords);
mlt.setBoost(boostTerms);
mlt.setBoostFactor(boostTermsFactor);
- //LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field)
- BooleanQuery bq = (BooleanQuery) mlt.like(new FastStringReader(likeText), moreLikeFields[0]);
- BooleanClause[] clauses = bq.getClauses();
+ Reader[] readers = new Reader[likeText.length];
+ for (int i = 0; i < readers.length; i++) {
+ readers[i] = new FastStringReader(likeText[i]);
+ }
+ //LUCENE 4 UPGRADE this maps the 3.6 behavior (only use the first field)
+ BooleanQuery bq = (BooleanQuery) mlt.like(moreLikeFields[0], readers);
+
+ BooleanClause[] clauses = bq.getClauses();
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
bq.setBoost(getBoost());
@@ -157,14 +162,22 @@ public class MoreLikeThisQuery extends Query {
@Override
public String toString(String field) {
- return "like:" + likeText;
+ return "like:" + Arrays.toString(likeText);
}
public String getLikeText() {
+ return (likeText == null ? null : likeText[0]);
+ }
+
+ public String[] getLikeTexts() {
return likeText;
}
public void setLikeText(String likeText) {
+ this.likeText = new String[]{likeText};
+ }
+
+ public void setLikeText(String... likeText) {
this.likeText = likeText;
}
diff --git a/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java b/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java
new file mode 100644
index 00000000000..f2314afe3b5
--- /dev/null
+++ b/src/main/java/org/elasticsearch/common/lucene/search/XMoreLikeThis.java
@@ -0,0 +1,964 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Copyright 2004-2005 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.elasticsearch.common.lucene.search;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexableField;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.*;
+import org.apache.lucene.search.similarities.DefaultSimilarity;
+import org.apache.lucene.search.similarities.TFIDFSimilarity;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.lucene.util.UnicodeUtil;
+import org.elasticsearch.Version;
+import org.elasticsearch.common.io.FastStringReader;
+
+/**
+ * Generate "more like this" similarity queries.
+ * Based on this mail:
+ *
+ *
+ * Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
+ * Term frequencies can be computed by re-tokenizing the text, which, for a single document,
+ * is usually fast enough. But looking up the docFreq() of every term in the document is
+ * probably too slow.
+ *
+ * You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
+ * or at all. Since you're trying to maximize a tf*idf score, you're probably most interested
+ * in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
+ * reduce the number of terms under consideration. Another heuristic is that terms with a
+ * high idf (i.e., a low df) tend to be longer. So you could threshold the terms by the
+ * number of characters, not selecting anything less than, e.g., six or seven characters.
+ * With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
+ * that do a pretty good job of characterizing a document.
+ *
+ * It all depends on what you're trying to do. If you're trying to eek out that last percent
+ * of precision and recall regardless of computational difficulty so that you can win a TREC
+ * competition, then the techniques I mention above are useless. But if you're trying to
+ * provide a "more like this" button on a search results page that does a decent job and has
+ * good performance, such techniques might be useful.
+ *
+ * An efficient, effective "more-like-this" query generator would be a great contribution, if
+ * anyone's interested. I'd imagine that it would take a Reader or a String (the document's
+ * text), analyzer Analyzer, and return a set of representative terms using heuristics like those
+ * above. The frequency and length thresholds could be parameters, etc.
+ *
+ * Doug
+ *
+ *
+ * <pre class="prettyprint">
+ * IndexReader ir = ...
+ * IndexSearcher is = ...
+ *
+ * MoreLikeThis mlt = new MoreLikeThis(ir);
+ * Reader target = ... // orig source of doc you want to find similarities to
+ * Query query = mlt.like( target);
+ *
+ * Hits hits = is.search(query);
+ * // now the usual iteration thru 'hits' - the only thing to watch for is to make sure
+ * //you ignore the doc if it matches your 'target' document, as it should be similar to itself
+ * </pre>
+ *
+ * Thus you:
+ * <ol>
+ * <li> do your normal, Lucene setup for searching,
+ * <li> create a MoreLikeThis,
+ * <li> get the text of the doc you want to find similarities to
+ * <li> then call one of the like() calls to generate a similarity query
+ * <li> call the searcher to find the similar docs
+ * </ol>
+ * <p/>
+ * Changes: Mark Harwood 29/02/04
+ * Some bugfixing, some refactoring, some optimisation.
+ * - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
+ * - bugfix: No significant terms being created for fields with a termvector - because
+ * was only counting one occurrence per term/field pair in calculations(ie not including frequency info from TermVector)
+ * - refactor: moved common code into isNoiseWord()
+ * - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
+ */
+
+public final class XMoreLikeThis {
+
+    static {
+        assert Version.CURRENT.luceneVersion == org.apache.lucene.util.Version.LUCENE_48: "Remove this class once we upgrade to Lucene 4.9";
+    }
+
+    /**
+     * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
+     *
+     * @see #getMaxNumTokensParsed
+     */
+    public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
+
+    /**
+     * Ignore terms with less than this frequency in the source doc.
+     *
+     * @see #getMinTermFreq
+     * @see #setMinTermFreq
+     */
+    public static final int DEFAULT_MIN_TERM_FREQ = 2;
+
+    /**
+     * Ignore words which do not occur in at least this many docs.
+     *
+     * @see #getMinDocFreq
+     * @see #setMinDocFreq
+     */
+    public static final int DEFAULT_MIN_DOC_FREQ = 5;
+
+    /**
+     * Ignore words which occur in more than this many docs.
+     *
+     * @see #getMaxDocFreq
+     * @see #setMaxDocFreq
+     * @see #setMaxDocFreqPct
+     */
+    public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;
+
+    /**
+     * Boost terms in query based on score.
+     *
+     * @see #isBoost
+     * @see #setBoost
+     */
+    public static final boolean DEFAULT_BOOST = false;
+
+    /**
+     * Default field names. Null is used to specify that the field names should be looked
+     * up at runtime from the provided reader.
+     */
+    public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"};
+
+    /**
+     * Ignore words less than this length or if 0 then this has no effect.
+     *
+     * @see #getMinWordLen
+     * @see #setMinWordLen
+     */
+    public static final int DEFAULT_MIN_WORD_LENGTH = 0;
+
+    /**
+     * Ignore words greater than this length or if 0 then this has no effect.
+     *
+     * @see #getMaxWordLen
+     * @see #setMaxWordLen
+     */
+    public static final int DEFAULT_MAX_WORD_LENGTH = 0;
+
+    /**
+     * Default set of stopwords.
+     * If null means to allow stop words.
+     *
+     * @see #setStopWords
+     * @see #getStopWords
+     */
+    public static final Set<?> DEFAULT_STOP_WORDS = null;
+
+    /**
+     * Current set of stop words.
+     */
+    private Set<?> stopWords = DEFAULT_STOP_WORDS;
+
+    /**
+     * Return a Query with no more than this many terms.
+     *
+     * @see BooleanQuery#getMaxClauseCount
+     * @see #getMaxQueryTerms
+     * @see #setMaxQueryTerms
+     */
+    public static final int DEFAULT_MAX_QUERY_TERMS = 25;
+
+    /**
+     * Analyzer that will be used to parse the doc.
+     */
+    private Analyzer analyzer = null;
+
+    /**
+     * Ignore words less frequent that this.
+     */
+    private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
+
+    /**
+     * Ignore words which do not occur in at least this many docs.
+     */
+    private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
+
+    /**
+     * Ignore words which occur in more than this many docs.
+     */
+    private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
+
+    /**
+     * Should we apply a boost to the Query based on the scores?
+     */
+    private boolean boost = DEFAULT_BOOST;
+
+    /**
+     * Field name we'll analyze.
+     */
+    private String[] fieldNames = DEFAULT_FIELD_NAMES;
+
+    /**
+     * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
+     */
+    private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
+
+    /**
+     * Ignore words if less than this len.
+     */
+    private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
+
+    /**
+     * Ignore words if greater than this len.
+     */
+    private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
+
+    /**
+     * Don't return a query longer than this.
+     */
+    private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
+
+    /**
+     * For idf() calculations.
+     */
+    private TFIDFSimilarity similarity;// = new DefaultSimilarity();
+
+    /**
+     * IndexReader to use
+     */
+    private final IndexReader ir;
+
+    /**
+     * Boost factor to use when boosting the terms
+     */
+    private float boostFactor = 1;
+
+    /**
+     * Returns the boost factor used when boosting terms
+     *
+     * @return the boost factor used when boosting terms
+     * @see #setBoostFactor(float)
+     */
+    public float getBoostFactor() {
+        return boostFactor;
+    }
+
+    /**
+     * Sets the boost factor to use when boosting terms
+     *
+     * @see #getBoostFactor()
+     */
+    public void setBoostFactor(float boostFactor) {
+        this.boostFactor = boostFactor;
+    }
+
+    /**
+     * Constructor requiring an IndexReader.
+     */
+    public XMoreLikeThis(IndexReader ir) {
+        this(ir, new DefaultSimilarity());
+    }
+
+    public XMoreLikeThis(IndexReader ir, TFIDFSimilarity sim) {
+        this.ir = ir;
+        this.similarity = sim;
+    }
+
+
+    public TFIDFSimilarity getSimilarity() {
+        return similarity;
+    }
+
+    public void setSimilarity(TFIDFSimilarity similarity) {
+        this.similarity = similarity;
+    }
+
+    /**
+     * Returns an analyzer that will be used to parse source doc with. The default analyzer
+     * is not set.
+     *
+     * @return the analyzer that will be used to parse source doc with.
+     */
+    public Analyzer getAnalyzer() {
+        return analyzer;
+    }
+
+    /**
+     * Sets the analyzer to use. An analyzer is not required for generating a query with the
+     * {@link #like(int)} method, all other 'like' methods require an analyzer.
+     *
+     * @param analyzer the analyzer to use to tokenize text.
+     */
+    public void setAnalyzer(Analyzer analyzer) {
+        this.analyzer = analyzer;
+    }
+
+    /**
+     * Returns the frequency below which terms will be ignored in the source doc. The default
+     * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}.
+     *
+     * @return the frequency below which terms will be ignored in the source doc.
+     */
+    public int getMinTermFreq() {
+        return minTermFreq;
+    }
+
+    /**
+     * Sets the frequency below which terms will be ignored in the source doc.
+     *
+     * @param minTermFreq the frequency below which terms will be ignored in the source doc.
+     */
+    public void setMinTermFreq(int minTermFreq) {
+        this.minTermFreq = minTermFreq;
+    }
+
+    /**
+     * Returns the frequency at which words will be ignored which do not occur in at least this
+     * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}.
+     *
+     * @return the frequency at which words will be ignored which do not occur in at least this
+     *         many docs.
+     */
+    public int getMinDocFreq() {
+        return minDocFreq;
+    }
+
+    /**
+     * Sets the frequency at which words will be ignored which do not occur in at least this
+     * many docs.
+     *
+     * @param minDocFreq the frequency at which words will be ignored which do not occur in at
+     *                   least this many docs.
+     */
+    public void setMinDocFreq(int minDocFreq) {
+        this.minDocFreq = minDocFreq;
+    }
+
+    /**
+     * Returns the maximum frequency in which words may still appear.
+     * Words that appear in more than this many docs will be ignored. The default frequency is
+     * {@link #DEFAULT_MAX_DOC_FREQ}.
+     *
+     * @return get the maximum frequency at which words are still allowed,
+     *         words which occur in more docs than this are ignored.
+     */
+    public int getMaxDocFreq() {
+        return maxDocFreq;
+    }
+
+    /**
+     * Set the maximum frequency in which words may still appear. Words that appear
+     * in more than this many docs will be ignored.
+     *
+     * @param maxFreq the maximum count of documents that a term may appear
+     *                in to be still considered relevant
+     */
+    public void setMaxDocFreq(int maxFreq) {
+        this.maxDocFreq = maxFreq;
+    }
+
+    /**
+     * Set the maximum percentage in which words may still appear. Words that appear
+     * in more than this many percent of all docs will be ignored.
+     *
+     * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear
+     *                      in to be still considered relevant
+     */
+    public void setMaxDocFreqPct(int maxPercentage) {
+        this.maxDocFreq = maxPercentage * ir.numDocs() / 100;
+    }
+
+
+    /**
+     * Returns whether to boost terms in query based on "score" or not. The default is
+     * {@link #DEFAULT_BOOST}.
+     *
+     * @return whether to boost terms in query based on "score" or not.
+     * @see #setBoost
+     */
+    public boolean isBoost() {
+        return boost;
+    }
+
+    /**
+     * Sets whether to boost terms in query based on "score" or not.
+     *
+     * @param boost true to boost terms in query based on "score", false otherwise.
+     * @see #isBoost
+     */
+    public void setBoost(boolean boost) {
+        this.boost = boost;
+    }
+
+    /**
+     * Returns the field names that will be used when generating the 'More Like This' query.
+     * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}.
+     *
+     * @return the field names that will be used when generating the 'More Like This' query.
+     */
+    public String[] getFieldNames() {
+        return fieldNames;
+    }
+
+    /**
+     * Sets the field names that will be used when generating the 'More Like This' query.
+     * Set this to null for the field names to be determined at runtime from the IndexReader
+     * provided in the constructor.
+     *
+     * @param fieldNames the field names that will be used when generating the 'More Like This'
+     *                   query.
+     */
+    public void setFieldNames(String[] fieldNames) {
+        this.fieldNames = fieldNames;
+    }
+
+    /**
+     * Returns the minimum word length below which words will be ignored. Set this to 0 for no
+     * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.
+     *
+     * @return the minimum word length below which words will be ignored.
+     */
+    public int getMinWordLen() {
+        return minWordLen;
+    }
+
+    /**
+     * Sets the minimum word length below which words will be ignored.
+     *
+     * @param minWordLen the minimum word length below which words will be ignored.
+     */
+    public void setMinWordLen(int minWordLen) {
+        this.minWordLen = minWordLen;
+    }
+
+    /**
+     * Returns the maximum word length above which words will be ignored. Set this to 0 for no
+     * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}.
+     *
+     * @return the maximum word length above which words will be ignored.
+     */
+    public int getMaxWordLen() {
+        return maxWordLen;
+    }
+
+    /**
+     * Sets the maximum word length above which words will be ignored.
+     *
+     * @param maxWordLen the maximum word length above which words will be ignored.
+     */
+    public void setMaxWordLen(int maxWordLen) {
+        this.maxWordLen = maxWordLen;
+    }
+
+    /**
+     * Set the set of stopwords.
+     * Any word in this set is considered "uninteresting" and ignored.
+     * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
+     * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
+     *
+     * @param stopWords set of stopwords, if null it means to allow stop words
+     * @see #getStopWords
+     */
+    public void setStopWords(Set<?> stopWords) {
+        this.stopWords = stopWords;
+    }
+
+    /**
+     * Get the current stop words being used.
+     *
+     * @see #setStopWords
+     */
+    public Set<?> getStopWords() {
+        return stopWords;
+    }
+
+
+    /**
+     * Returns the maximum number of query terms that will be included in any generated query.
+     * The default is {@link #DEFAULT_MAX_QUERY_TERMS}.
+     *
+     * @return the maximum number of query terms that will be included in any generated query.
+     */
+    public int getMaxQueryTerms() {
+        return maxQueryTerms;
+    }
+
+    /**
+     * Sets the maximum number of query terms that will be included in any generated query.
+     *
+     * @param maxQueryTerms the maximum number of query terms that will be included in any
+     *                      generated query.
+     */
+    public void setMaxQueryTerms(int maxQueryTerms) {
+        this.maxQueryTerms = maxQueryTerms;
+    }
+
+    /**
+     * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
+     * @see #DEFAULT_MAX_NUM_TOKENS_PARSED
+     */
+    public int getMaxNumTokensParsed() {
+        return maxNumTokensParsed;
+    }
+
+    /**
+     * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
+     */
+    public void setMaxNumTokensParsed(int i) {
+        maxNumTokensParsed = i;
+    }
+
+
+    /**
+     * Return a query that will return docs like the passed lucene document ID.
+     *
+     * @param docNum the documentID of the lucene doc to generate the 'More Like This" query for.
+     * @return a query that will return docs like the passed lucene document ID.
+     */
+    public Query like(int docNum) throws IOException {
+        if (fieldNames == null) {
+            // gather list of valid fields from lucene
+            Collection