LUCENE-5478: CommonTermsQuery now allows creating custom term queries

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1572613 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2014-02-27 15:14:46 +00:00
parent 33b8e75a42
commit 7877ebeb63
3 changed files with 107 additions and 21 deletions

View File

@ -73,6 +73,10 @@ New Features
* LUCENE-5454: Add SortedSetSortField to lucene/sandbox, to allow sorting
on multi-valued field. (Robert Muir)
* LUCENE-5478: CommonTermsQuery now allows creating custom term queries,
similar to the query parser, by overriding the newTermQuery method.
(Simon Willnauer)
API Changes
* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues

View File

@ -16,11 +16,6 @@ package org.apache.lucene.queries;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
@ -30,12 +25,17 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
/**
* A query that executes high-frequency terms in an optional sub-query to prevent
* slow queries due to "common" terms like stopwords. This query
@ -149,7 +149,7 @@ public class CommonTermsQuery extends Query {
if (this.terms.isEmpty()) {
return new BooleanQuery();
} else if (this.terms.size() == 1) {
final TermQuery tq = new TermQuery(this.terms.get(0));
final Query tq = newTermQuery(this.terms.get(0), null);
tq.setBoost(getBoost());
return tq;
}
@ -186,7 +186,7 @@ public class CommonTermsQuery extends Query {
for (int i = 0; i < queryTerms.length; i++) {
TermContext termContext = contextArray[i];
if (termContext == null) {
lowFreq.add(new TermQuery(queryTerms[i]), lowFreqOccur);
lowFreq.add(newTermQuery(queryTerms[i], null), lowFreqOccur);
} else {
if ((maxTermFrequency >= 1f && termContext.docFreq() > maxTermFrequency)
|| (termContext.docFreq() > (int) Math.ceil(maxTermFrequency
@ -351,7 +351,7 @@ public class CommonTermsQuery extends Query {
}
for (int i = 0; i < terms.size(); i++) {
Term t = terms.get(i);
buffer.append(new TermQuery(t).toString());
buffer.append(newTermQuery(t, null).toString());
if (i != terms.size() - 1) buffer.append(", ");
}
@ -412,4 +412,14 @@ public class CommonTermsQuery extends Query {
return true;
}
/**
 * Creates the query used for a single term.
 * <p>Subclasses may override this hook to customize the queries that are
 * generated for individual terms.</p>
 *
 * @param term the term to build a query for
 * @param context the TermContext to hand to the term query, or
 *        <code>null</code> if none is available
 * @return a new query for the given term; this implementation returns a
 *         {@link TermQuery}
 */
protected Query newTermQuery(Term term, TermContext context) {
  if (context != null) {
    // A context is available: pass it along to the term query.
    return new TermQuery(term, context);
  }
  return new TermQuery(term);
}
}

View File

@ -17,14 +17,6 @@ package org.apache.lucene.queries;
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.AtomicReader;
@ -33,12 +25,14 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryUtils;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
@ -49,7 +43,15 @@ import org.apache.lucene.util.LineFileDocs;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.TestUtil;
import org.junit.Test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
public class CommonTermsQueryTest extends LuceneTestCase {
@ -339,6 +341,60 @@ public class CommonTermsQueryTest extends LuceneTestCase {
}
}
/**
 * Checks that overriding {@code newTermQuery} in a subclass changes the
 * ranking: the same six terms are searched once with a plain
 * {@code CommonTermsQuery} and once with {@code ExtendedCommonTermsQuery},
 * and the expected order of the three hits differs between the two.
 */
@Test
public void testExtend() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  String[] docs = new String[] {"this is the end of the world right",
      "is this it or maybe not",
      "this is the end of the universe as we know it",
      "there is the famous restaurant at the end of the universe",};
  for (int i = 0; i < docs.length; i++) {
    Document doc = new Document();
    doc.add(newStringField("id", "" + i, Field.Store.YES));
    doc.add(newTextField("field", docs[i], Field.Store.NO));
    w.addDocument(doc);
  }

  IndexReader r = w.getReader();
  IndexSearcher s = newSearcher(r);
  // Both queries below are built from the same terms, added in this order.
  String[] queryTerms = {"is", "this", "end", "world", "universe", "right"};
  {
    // Baseline: unmodified term queries.
    CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,
        random().nextBoolean() ? 2.0f : 0.5f);
    for (String text : queryTerms) {
      query.add(new Term("field", text));
    }
    TopDocs search = s.search(query, 10);
    // JUnit convention: expected value first, actual value second.
    assertEquals(3, search.totalHits);
    assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
    assertEquals("2", r.document(search.scoreDocs[1].doc).get("id"));
    assertEquals("3", r.document(search.scoreDocs[2].doc).get("id"));
  }

  {
    // ExtendedCommonTermsQuery sets a boost of 100 on the term query for
    // "universe", so the documents containing it (ids 2 and 3) are expected
    // to rank ahead of id 0.
    CommonTermsQuery query = new ExtendedCommonTermsQuery(Occur.SHOULD, Occur.SHOULD,
        random().nextBoolean() ? 2.0f : 0.5f);
    for (String text : queryTerms) {
      query.add(new Term("field", text));
    }
    TopDocs search = s.search(query, 10);
    assertEquals(3, search.totalHits);
    assertEquals("2", r.document(search.scoreDocs[0].doc).get("id"));
    assertEquals("3", r.document(search.scoreDocs[1].doc).get("id"));
    assertEquals("0", r.document(search.scoreDocs[2].doc).get("id"));
  }
  r.close();
  w.close();
  dir.close();
}
public void testRandomIndex() throws IOException {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
@ -480,4 +536,20 @@ public class CommonTermsQueryTest extends LuceneTestCase {
lineFileDocs.close();
}
/**
 * CommonTermsQuery variant used by the tests: the query created for the
 * term text "universe" gets a boost of 100, every other term query is
 * returned exactly as the superclass built it.
 */
private static final class ExtendedCommonTermsQuery extends CommonTermsQuery {

  public ExtendedCommonTermsQuery(Occur highFreqOccur, Occur lowFreqOccur, float maxTermFrequency) {
    super(highFreqOccur, lowFreqOccur, maxTermFrequency);
  }

  /**
   * Delegates to the superclass and then boosts the result when the term
   * text is "universe".
   */
  @Override
  protected Query newTermQuery(Term term, TermContext context) {
    final Query termQuery = super.newTermQuery(term, context);
    final boolean boosted = term.text().equals("universe");
    if (!boosted) {
      return termQuery;
    }
    termQuery.setBoost(100f);
    return termQuery;
  }
}
}