LUCENE-7624: Move TermsQuery into core as TermInSetQuery

This commit is contained in:
Alan Woodward 2017-01-09 14:01:33 +00:00
parent 5e9f9279cc
commit 22940f5c49
13 changed files with 451 additions and 408 deletions

View File

@ -85,6 +85,9 @@ API Changes
* LUCENE-7611: DocumentValueSourceDictionary now takes a LongValuesSource
as a parameter, and the ValueSource equivalent is deprecated (Alan Woodward)
* LUCENE-7624: TermsQuery has been renamed as TermInSetQuery and moved to core.
(Alan Woodward)
New features
* LUCENE-5867: Added BooleanSimilarity. (Robert Muir, Adrien Grand)

View File

@ -0,0 +1,369 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Specialization for a disjunction over many terms that behaves like a
* {@link ConstantScoreQuery} over a {@link BooleanQuery} containing only
* {@link org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses.
* <p>For instance in the following example, both {@code q1} and {@code q2}
* would yield the same scores:
* <pre class="prettyprint">
* Query q1 = new TermInSetQuery(new Term("field", "foo"), new Term("field", "bar"));
*
* BooleanQuery.Builder bq = new BooleanQuery.Builder();
* bq.add(new TermQuery(new Term("field", "foo")), Occur.SHOULD);
* bq.add(new TermQuery(new Term("field", "bar")), Occur.SHOULD);
* Query q2 = new ConstantScoreQuery(bq.build());
* </pre>
* <p>When there are few terms, this query executes like a regular disjunction.
* However, when there are many terms, instead of merging iterators on the fly,
* it will populate a bit set with matching docs and return a {@link Scorer}
* over this bit set.
* <p>NOTE: This query produces scores that are equal to its boost
*/
public class TermInSetQuery extends Query implements Accountable {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermInSetQuery.class);
// Same threshold as MultiTermQueryConstantScoreWrapper
static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16;
private final boolean singleField; // whether all terms are from the same field
private final PrefixCodedTerms termData;
private final int termDataHashCode; // cached hashcode of termData
/**
 * Builds a {@link TermInSetQuery} matching any of the supplied terms. The
 * collection may mix fields and may contain the same term more than once;
 * repeated terms are collapsed to a single entry.
 *
 * @param terms the terms to match; sorted internally unless it is a
 *              {@link SortedSet} that already uses natural ordering
 */
public TermInSetQuery(Collection<Term> terms) {
  final Term[] ordered = terms.toArray(new Term[terms.size()]);
  // A SortedSet without a custom comparator is already in natural order,
  // so the sort can be skipped.
  final boolean naturallySorted =
      terms instanceof SortedSet && ((SortedSet<Term>) terms).comparator() == null;
  if (naturallySorted == false) {
    ArrayUtil.timSort(ordered);
  }

  final PrefixCodedTerms.Builder encoder = new PrefixCodedTerms.Builder();
  final Set<String> seenFields = new HashSet<>();
  Term last = null;
  for (int i = 0; i < ordered.length; ++i) {
    final Term current = ordered[i];
    // After sorting, duplicates are adjacent: only emit a term that differs
    // from its predecessor.
    if (!current.equals(last)) {
      seenFields.add(current.field());
      encoder.add(current);
    }
    last = current;
  }

  this.singleField = seenFields.size() == 1;
  this.termData = encoder.finish();
  this.termDataHashCode = this.termData.hashCode();
}
/**
 * Builds a {@link TermInSetQuery} matching any of the supplied term values
 * within one field. Repeated values are collapsed to a single entry.
 *
 * @param field the field every term belongs to
 * @param terms the term values to match; sorted internally unless it is a
 *              {@link SortedSet} that already uses natural ordering
 */
public TermInSetQuery(String field, Collection<BytesRef> terms) {
  final BytesRef[] ordered = terms.toArray(new BytesRef[terms.size()]);
  // A SortedSet without a custom comparator is already in natural order,
  // so the sort can be skipped.
  final boolean naturallySorted =
      terms instanceof SortedSet && ((SortedSet<BytesRef>) terms).comparator() == null;
  if (naturallySorted == false) {
    ArrayUtil.timSort(ordered);
  }

  final PrefixCodedTerms.Builder encoder = new PrefixCodedTerms.Builder();
  // Holds a copy of the most recently added value; null until the first add.
  BytesRefBuilder lastAdded = null;
  for (int i = 0; i < ordered.length; ++i) {
    final BytesRef current = ordered[i];
    if (lastAdded != null && lastAdded.get().equals(current)) {
      // Same value as the previous one: skip the duplicate.
      continue;
    }
    if (lastAdded == null) {
      lastAdded = new BytesRefBuilder();
    }
    encoder.add(field, current);
    lastAdded.copyBytes(current);
  }

  this.singleField = true;
  this.termData = encoder.finish();
  this.termDataHashCode = this.termData.hashCode();
}
/**
* Creates a new {@link TermInSetQuery} from the given {@link BytesRef} array for
* a single field. This is a convenience overload that wraps the array with
* {@link Arrays#asList(Object[])} and delegates to
* {@link #TermInSetQuery(String, Collection)}, so duplicate values are handled
* the same way as in that constructor.
*
* @param field the field every term belongs to
* @param terms the term values to match
*/
public TermInSetQuery(String field, BytesRef...terms) {
this(field, Arrays.asList(terms));
}
/**
* Creates a new {@link TermInSetQuery} from the given array. The array can
* contain duplicate terms and multiple fields. This is a convenience overload
* that wraps the array with {@link Arrays#asList(Object[])} and delegates to
* {@link #TermInSetQuery(Collection)}.
*
* @param terms the terms to match
*/
public TermInSetQuery(final Term... terms) {
this(Arrays.asList(terms));
}
/**
 * Rewrites this query to a constant-score disjunction of {@link TermQuery}
 * clauses when it holds no more terms than the rewrite threshold (the smaller
 * of {@code BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD} and
 * {@link BooleanQuery#getMaxClauseCount()}); otherwise defers to the
 * superclass rewrite.
 */
@Override
public Query rewrite(IndexReader reader) throws IOException {
  final int maxDisjuncts =
      Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
  if (termData.size() > maxDisjuncts) {
    return super.rewrite(reader);
  }

  final BooleanQuery.Builder disjunction = new BooleanQuery.Builder();
  final TermIterator it = termData.iterator();
  BytesRef bytes;
  while ((bytes = it.next()) != null) {
    // Each clause gets its own deep copy of the term bytes returned by the iterator.
    final Term copy = new Term(it.field(), BytesRef.deepCopyOf(bytes));
    disjunction.add(new TermQuery(copy), Occur.SHOULD);
  }
  return new ConstantScoreQuery(disjunction.build());
}
/**
 * Two instances are equal when they are of the same class and hold equal
 * term data.
 */
@Override
public boolean equals(Object other) {
  if (sameClassAs(other) == false) {
    return false;
  }
  return equalsTo(getClass().cast(other));
}
/** Compares the term data of this query with that of {@code other}. */
private boolean equalsTo(TermInSetQuery other) {
  // Comparing termData may be expensive, so rule out a mismatch via the
  // cached hash codes before doing the full comparison.
  if (termDataHashCode != other.termDataHashCode) {
    return false;
  }
  return termData.equals(other.termData);
}
/** Combines the class hash with the cached hash code of the term data. */
@Override
public int hashCode() {
  final int classComponent = 31 * classHash();
  return classComponent + termDataHashCode;
}
/**
* Returns the terms wrapped in a PrefixCodedTerms.
*
* @return the {@link PrefixCodedTerms} instance held by this query (the
*         internal reference is returned directly, not a copy)
*/
public PrefixCodedTerms getTermData() {
return termData;
}
/**
 * Renders every term of this query, in iteration order, using
 * {@link Term#toString()} and separating entries with a single space.
 * The {@code defaultField} argument is not consulted.
 */
@Override
public String toString(String defaultField) {
  final StringBuilder out = new StringBuilder();
  final TermIterator it = termData.iterator();
  // Empty before the first term, a single space afterwards.
  String separator = "";
  BytesRef bytes;
  while ((bytes = it.next()) != null) {
    out.append(separator);
    out.append(new Term(it.field(), bytes).toString());
    separator = " ";
  }
  return out.toString();
}
/**
 * Returns the shallow size of this instance plus the memory reported by its
 * term data.
 */
@Override
public long ramBytesUsed() {
  final long termBytes = termData.ramBytesUsed();
  return BASE_RAM_BYTES_USED + termBytes;
}
/**
* This query reports no nested {@link Accountable} resources: the returned
* collection is always empty.
*/
@Override
public Collection<Accountable> getChildResources() {
return Collections.emptyList();
}
/**
 * Snapshot of a term at the current position of a {@link TermsEnum}: the
 * field, the enum itself, a copy of the term bytes, the term state and the
 * term statistics.
 */
private static class TermAndState {
  final String field;
  final TermsEnum termsEnum;
  final BytesRef term;
  final TermState state;
  final int docFreq;
  final long totalTermFreq;

  /**
   * Captures the term the given enum is currently positioned on.
   *
   * @param fieldName       the field the enum iterates over
   * @param positionedEnum  the enum to read the term, state and statistics from
   */
  TermAndState(String fieldName, TermsEnum positionedEnum) throws IOException {
    field = fieldName;
    termsEnum = positionedEnum;
    // Deep copy of the term bytes — presumably because the enum may reuse its
    // buffer once it is repositioned (TODO confirm against TermsEnum contract).
    term = BytesRef.deepCopyOf(positionedEnum.term());
    state = positionedEnum.termState();
    docFreq = positionedEnum.docFreq();
    totalTermFreq = positionedEnum.totalTermFreq();
  }
}
/**
 * Holder for the outcome of the per-leaf rewrite: either a {@link Weight} or a
 * {@link DocIdSet}. Exactly one constructor is used per instance, and the
 * field belonging to the other alternative is left {@code null}.
 */
private static class WeightOrDocIdSet {
  final Weight weight;
  final DocIdSet set;

  /** Wraps a weight; {@code weight} must not be {@code null}. */
  WeightOrDocIdSet(Weight weight) {
    Objects.requireNonNull(weight);
    this.set = null;
    this.weight = weight;
  }

  /** Wraps a doc id set; the argument is stored as given, without a null check. */
  WeightOrDocIdSet(DocIdSet bitset) {
    this.weight = null;
    this.set = bitset;
  }
}
/**
* Creates a {@link ConstantScoreWeight} for this query. For each leaf, the
* returned weight either delegates to the weight of a small constant-score
* disjunction of {@link TermQuery} clauses, or scores from a {@link DocIdSet}
* built from the postings of all matching terms (see the private
* {@code rewrite(LeafReaderContext)} helper below).
*/
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
return new ConstantScoreWeight(this, boost) {
@Override
public void extractTerms(Set<Term> terms) {
// no-op
// This query is for abuse cases when the number of terms is too high to
// run efficiently as a BooleanQuery. So likewise we hide its terms in
// order to protect highlighters
}
/**
* On the given leaf context, try to either rewrite to a disjunction if
* there are few matching terms, or build a bitset containing matching docs.
*/
private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
final LeafReader reader = context.reader();
// We will first try to collect up to 'threshold' terms into 'matchingTerms'
// if there are too many terms, we will fall back to building the 'builder'
final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
assert termData.size() > threshold : "Query should have been rewritten";
// Non-null while we are still buffering terms; set to null once we have
// switched over to accumulating postings into 'builder'.
List<TermAndState> matchingTerms = new ArrayList<>(threshold);
DocIdSetBuilder builder = null;
final Fields fields = reader.fields();
String lastField = null;
Terms terms = null;
TermsEnum termsEnum = null;
PostingsEnum docs = null;
TermIterator iterator = termData.iterator();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
String field = iterator.field();
// comparing references is fine here
if (field != lastField) {
// Field changed: look up the terms of the new field; a null termsEnum
// means this leaf has no terms for that field.
terms = fields.terms(field);
if (terms == null) {
termsEnum = null;
} else {
termsEnum = terms.iterator();
}
lastField = field;
}
// Only terms that actually exist in this leaf are considered below.
if (termsEnum != null && termsEnum.seekExact(term)) {
if (matchingTerms == null) {
// Already in builder mode: add this term's postings directly.
docs = termsEnum.postings(docs, PostingsEnum.NONE);
builder.add(docs);
} else if (matchingTerms.size() < threshold) {
// Still below the threshold: just remember the term and its state.
matchingTerms.add(new TermAndState(field, termsEnum));
} else {
// This is the (threshold + 1)-th matching term: switch to builder mode.
assert matchingTerms.size() == threshold;
if (singleField) {
// common case: all terms are in the same field
// use an optimized builder that leverages terms stats to be more efficient
builder = new DocIdSetBuilder(reader.maxDoc(), terms);
} else {
// corner case: different fields
// don't make assumptions about the docs we will get
builder = new DocIdSetBuilder(reader.maxDoc());
}
docs = termsEnum.postings(docs, PostingsEnum.NONE);
builder.add(docs);
// Replay the buffered terms into the builder, repositioning each
// saved enum with its recorded term state.
for (TermAndState t : matchingTerms) {
t.termsEnum.seekExact(t.term, t.state);
docs = t.termsEnum.postings(docs, PostingsEnum.NONE);
builder.add(docs);
}
matchingTerms = null;
}
}
}
if (matchingTerms != null) {
// At most 'threshold' terms matched in this leaf: run them as a
// constant-score disjunction instead of building a doc id set.
assert builder == null;
BooleanQuery.Builder bq = new BooleanQuery.Builder();
for (TermAndState t : matchingTerms) {
// Register the term state captured on this leaf under the leaf's ordinal.
final TermContext termContext = new TermContext(searcher.getTopReaderContext());
termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
bq.add(new TermQuery(new Term(t.field, t.term), termContext), Occur.SHOULD);
}
Query q = new ConstantScoreQuery(bq.build());
final Weight weight = searcher.rewrite(q).createWeight(searcher, needsScores, score());
return new WeightOrDocIdSet(weight);
} else {
assert builder != null;
return new WeightOrDocIdSet(builder.build());
}
}
/**
* Wraps the iterator of the given set in a {@link ConstantScoreScorer} that
* uses this weight's score. Returns {@code null} when the set is
* {@code null} or when it provides no iterator.
*/
private Scorer scorer(DocIdSet set) throws IOException {
if (set == null) {
return null;
}
final DocIdSetIterator disi = set.iterator();
if (disi == null) {
return null;
}
return new ConstantScoreScorer(this, score(), disi);
}
/**
* Runs the per-leaf rewrite, then either delegates to the rewritten
* weight's bulk scorer or wraps a scorer over the doc id set in a
* {@link DefaultBulkScorer}. Returns {@code null} when no scorer is available.
*/
@Override
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
final WeightOrDocIdSet weightOrBitSet = rewrite(context);
if (weightOrBitSet.weight != null) {
return weightOrBitSet.weight.bulkScorer(context);
} else {
final Scorer scorer = scorer(weightOrBitSet.set);
if (scorer == null) {
return null;
}
return new DefaultBulkScorer(scorer);
}
}
/**
* Runs the per-leaf rewrite, then either delegates to the rewritten
* weight's scorer or returns a scorer over the doc id set.
*/
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
final WeightOrDocIdSet weightOrBitSet = rewrite(context);
if (weightOrBitSet.weight != null) {
return weightOrBitSet.weight.scorer(context);
} else {
return scorer(weightOrBitSet.set);
}
}
};
}
}

View File

@ -50,9 +50,8 @@ public final class UsageTrackingQueryCachingPolicy implements QueryCachingPolicy
// DocIdSet in the first place
return query instanceof MultiTermQuery ||
query instanceof MultiTermQueryConstantScoreWrapper ||
isPointQuery(query) ||
// can't refer to TermsQuery directly as it is in another module
"TermsQuery".equals(query.getClass().getSimpleName());
query instanceof TermInSetQuery ||
isPointQuery(query);
}
static boolean isCheap(Query query) {

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.queries;
package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
@ -26,6 +26,8 @@ import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
@ -40,16 +42,6 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryUtils;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.UsageTrackingQueryCachingPolicy;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
@ -57,10 +49,7 @@ import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageTester;
import org.apache.lucene.util.TestUtil;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
public class TermsQueryTest extends LuceneTestCase {
public class TermInSetQueryTest extends LuceneTestCase {
public void testDuel() throws IOException {
final int iters = atLeast(2);
@ -107,7 +96,7 @@ public class TermsQueryTest extends LuceneTestCase {
bq.add(new TermQuery(t), Occur.SHOULD);
}
final Query q1 = new ConstantScoreQuery(bq.build());
final Query q2 = new TermsQuery(queryTerms);
final Query q2 = new TermInSetQuery(queryTerms);
assertSameMatches(searcher, new BoostQuery(q1, boost), new BoostQuery(q2, boost), true);
}
@ -129,15 +118,15 @@ public class TermsQueryTest extends LuceneTestCase {
}
}
private TermsQuery termsQuery(boolean singleField, Term...terms) {
private TermInSetQuery termsQuery(boolean singleField, Term...terms) {
return termsQuery(singleField, Arrays.asList(terms));
}
private TermsQuery termsQuery(boolean singleField, Collection<Term> termList) {
private TermInSetQuery termsQuery(boolean singleField, Collection<Term> termList) {
if (!singleField) {
return new TermsQuery(new ArrayList<>(termList));
return new TermInSetQuery(new ArrayList<>(termList));
}
final TermsQuery filter;
final TermInSetQuery filter;
List<BytesRef> bytes = new ArrayList<>();
String field = null;
for (Term term : termList) {
@ -148,7 +137,7 @@ public class TermsQueryTest extends LuceneTestCase {
field = term.field();
}
assertNotNull(field);
filter = new TermsQuery(field, bytes);
filter = new TermInSetQuery(field, bytes);
return filter;
}
@ -162,60 +151,60 @@ public class TermsQueryTest extends LuceneTestCase {
String string = TestUtil.randomRealisticUnicodeString(random());
terms.add(new Term(field, string));
uniqueTerms.add(new Term(field, string));
TermsQuery left = termsQuery(singleField ? random().nextBoolean() : false, uniqueTerms);
TermInSetQuery left = termsQuery(singleField ? random().nextBoolean() : false, uniqueTerms);
Collections.shuffle(terms, random());
TermsQuery right = termsQuery(singleField ? random().nextBoolean() : false, terms);
TermInSetQuery right = termsQuery(singleField ? random().nextBoolean() : false, terms);
assertEquals(right, left);
assertEquals(right.hashCode(), left.hashCode());
if (uniqueTerms.size() > 1) {
List<Term> asList = new ArrayList<>(uniqueTerms);
asList.remove(0);
TermsQuery notEqual = termsQuery(singleField ? random().nextBoolean() : false, asList);
TermInSetQuery notEqual = termsQuery(singleField ? random().nextBoolean() : false, asList);
assertFalse(left.equals(notEqual));
assertFalse(right.equals(notEqual));
}
}
TermsQuery tq1 = new TermsQuery(new Term("thing", "apple"));
TermsQuery tq2 = new TermsQuery(new Term("thing", "orange"));
TermInSetQuery tq1 = new TermInSetQuery(new Term("thing", "apple"));
TermInSetQuery tq2 = new TermInSetQuery(new Term("thing", "orange"));
assertFalse(tq1.hashCode() == tq2.hashCode());
// different fields with the same term should have differing hashcodes
tq1 = new TermsQuery(new Term("thing1", "apple"));
tq2 = new TermsQuery(new Term("thing2", "apple"));
tq1 = new TermInSetQuery(new Term("thing1", "apple"));
tq2 = new TermInSetQuery(new Term("thing2", "apple"));
assertFalse(tq1.hashCode() == tq2.hashCode());
}
public void testSingleFieldEquals() {
// Two terms with the same hash code
assertEquals("AaAaBB".hashCode(), "BBBBBB".hashCode());
TermsQuery left = termsQuery(true, new Term("id", "AaAaAa"), new Term("id", "AaAaBB"));
TermsQuery right = termsQuery(true, new Term("id", "AaAaAa"), new Term("id", "BBBBBB"));
TermInSetQuery left = termsQuery(true, new Term("id", "AaAaAa"), new Term("id", "AaAaBB"));
TermInSetQuery right = termsQuery(true, new Term("id", "AaAaAa"), new Term("id", "BBBBBB"));
assertFalse(left.equals(right));
}
public void testToString() {
TermsQuery termsQuery = new TermsQuery(new Term("field1", "a"),
TermInSetQuery termsQuery = new TermInSetQuery(new Term("field1", "a"),
new Term("field1", "b"),
new Term("field1", "c"));
assertEquals("field1:a field1:b field1:c", termsQuery.toString());
}
public void testDedup() {
Query query1 = new TermsQuery(new Term("foo", "bar"));
Query query2 = new TermsQuery(new Term("foo", "bar"), new Term("foo", "bar"));
Query query1 = new TermInSetQuery(new Term("foo", "bar"));
Query query2 = new TermInSetQuery(new Term("foo", "bar"), new Term("foo", "bar"));
QueryUtils.checkEqual(query1, query2);
}
public void testOrderDoesNotMatter() {
// order of terms is different
Query query1 = new TermsQuery(new Term("foo", "bar"), new Term("foo", "baz"));
Query query2 = new TermsQuery(new Term("foo", "baz"), new Term("foo", "bar"));
Query query1 = new TermInSetQuery(new Term("foo", "bar"), new Term("foo", "baz"));
Query query2 = new TermInSetQuery(new Term("foo", "baz"), new Term("foo", "bar"));
QueryUtils.checkEqual(query1, query2);
// order of fields is different
query1 = new TermsQuery(new Term("foo", "bar"), new Term("bar", "bar"));
query2 = new TermsQuery(new Term("bar", "bar"), new Term("foo", "bar"));
query1 = new TermInSetQuery(new Term("foo", "bar"), new Term("bar", "bar"));
query2 = new TermInSetQuery(new Term("bar", "bar"), new Term("foo", "bar"));
QueryUtils.checkEqual(query1, query2);
}
@ -225,7 +214,7 @@ public class TermsQueryTest extends LuceneTestCase {
for (int i = 0; i < numTerms; ++i) {
terms.add(new Term("f", RandomStrings.randomUnicodeOfLength(random(), 10)));
}
TermsQuery query = new TermsQuery(terms);
TermInSetQuery query = new TermInSetQuery(terms);
final long actualRamBytesUsed = RamUsageTester.sizeOf(query);
final long expectedRamBytesUsed = query.ramBytesUsed();
// error margin within 5%
@ -308,7 +297,7 @@ public class TermsQueryTest extends LuceneTestCase {
final List<Term> terms = new ArrayList<>();
final Set<String> fields = new HashSet<>();
// enough terms to avoid the rewrite
final int numTerms = TestUtil.nextInt(random(), TermsQuery.BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD + 1, 100);
final int numTerms = TestUtil.nextInt(random(), TermInSetQuery.BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD + 1, 100);
for (int i = 0; i < numTerms; ++i) {
final String field = RandomPicks.randomFrom(random(), new String[] {"foo", "bar", "baz"});
final BytesRef term = new BytesRef(RandomStrings.randomUnicodeOfCodepointLength(random(), 10));
@ -316,19 +305,19 @@ public class TermsQueryTest extends LuceneTestCase {
terms.add(new Term(field, term));
}
new IndexSearcher(wrapped).count(new TermsQuery(terms));
new IndexSearcher(wrapped).count(new TermInSetQuery(terms));
assertEquals(fields.size(), counter.get());
wrapped.close();
dir.close();
}
public void testBinaryToString() {
TermsQuery query = new TermsQuery(new Term("field", new BytesRef(new byte[] { (byte) 0xff, (byte) 0xfe })));
TermInSetQuery query = new TermInSetQuery(new Term("field", new BytesRef(new byte[] { (byte) 0xff, (byte) 0xfe })));
assertEquals("field:[ff fe]", query.toString());
}
public void testIsConsideredCostlyByQueryCache() throws IOException {
TermsQuery query = new TermsQuery(new Term("foo", "bar"), new Term("foo", "baz"));
TermInSetQuery query = new TermInSetQuery(new Term("foo", "bar"), new Term("foo", "baz"));
UsageTrackingQueryCachingPolicy policy = new UsageTrackingQueryCachingPolicy();
assertFalse(policy.shouldCache(query));
policy.onUse(query);

View File

@ -16,13 +16,13 @@
*/
package org.apache.lucene.facet;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.search.Query;
import java.util.ArrayList;
import java.util.Collection;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
/**
* A multi-terms {@link Query} over a {@link FacetField}.
* <p>
@ -30,9 +30,9 @@ import java.util.Collection;
* especially in cases where you don't intend to use {@link DrillSideways}
*
* @lucene.experimental
* @see org.apache.lucene.queries.TermsQuery
* @see org.apache.lucene.search.TermInSetQuery
*/
public class MultiFacetQuery extends TermsQuery {
public class MultiFacetQuery extends TermInSetQuery {
/**
* Creates a new {@code MultiFacetQuery} filtering the query on the given dimension.

View File

@ -16,130 +16,33 @@
*/
package org.apache.lucene.queries;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Specialization for a disjunction over many terms that behaves like a
* {@link ConstantScoreQuery} over a {@link BooleanQuery} containing only
* {@link org.apache.lucene.search.BooleanClause.Occur#SHOULD} clauses.
* <p>For instance in the following example, both @{code q1} and {@code q2}
* would yield the same scores:
* <pre class="prettyprint">
* Query q1 = new TermsQuery(new Term("field", "foo"), new Term("field", "bar"));
*
* BooleanQuery bq = new BooleanQuery();
* bq.add(new TermQuery(new Term("field", "foo")), Occur.SHOULD);
* bq.add(new TermQuery(new Term("field", "bar")), Occur.SHOULD);
* Query q2 = new ConstantScoreQuery(bq);
* </pre>
* <p>When there are few terms, this query executes like a regular disjunction.
* However, when there are many terms, instead of merging iterators on the fly,
* it will populate a bit set with matching docs and return a {@link Scorer}
* over this bit set.
* <p>NOTE: This query produces scores that are equal to its boost
* @deprecated Use {@link org.apache.lucene.search.TermInSetQuery}
*/
public class TermsQuery extends Query implements Accountable {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermsQuery.class);
// Same threshold as MultiTermQueryConstantScoreWrapper
static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16;
private final boolean singleField; // whether all terms are from the same field
private final PrefixCodedTerms termData;
private final int termDataHashCode; // cached hashcode of termData
@Deprecated
public class TermsQuery extends TermInSetQuery {
/**
* Creates a new {@link TermsQuery} from the given collection. It
* can contain duplicate terms and multiple fields.
*/
public TermsQuery(Collection<Term> terms) {
Term[] sortedTerms = terms.toArray(new Term[terms.size()]);
// already sorted if we are a SortedSet with natural order
boolean sorted = terms instanceof SortedSet && ((SortedSet<Term>)terms).comparator() == null;
if (!sorted) {
ArrayUtil.timSort(sortedTerms);
}
PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
Set<String> fields = new HashSet<>();
Term previous = null;
for (Term term : sortedTerms) {
if (term.equals(previous) == false) {
fields.add(term.field());
builder.add(term);
}
previous = term;
}
singleField = fields.size() == 1;
termData = builder.finish();
termDataHashCode = termData.hashCode();
super(terms);
}
/**
* Creates a new {@link TermsQuery} from the given collection for
* a single field. It can contain duplicate terms.
*/
public TermsQuery(String field, Collection<BytesRef> terms) {
BytesRef[] sortedTerms = terms.toArray(new BytesRef[terms.size()]);
// already sorted if we are a SortedSet with natural order
boolean sorted = terms instanceof SortedSet && ((SortedSet<BytesRef>)terms).comparator() == null;
if (!sorted) {
ArrayUtil.timSort(sortedTerms);
}
PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
BytesRefBuilder previous = null;
for (BytesRef term : sortedTerms) {
if (previous == null) {
previous = new BytesRefBuilder();
} else if (previous.get().equals(term)) {
continue; // deduplicate
}
builder.add(field, term);
previous.copyBytes(term);
}
singleField = true;
termData = builder.finish();
termDataHashCode = termData.hashCode();
super(field, terms);
}
/**
@ -147,7 +50,7 @@ public class TermsQuery extends Query implements Accountable {
* a single field.
*/
public TermsQuery(String field, BytesRef...terms) {
this(field, Arrays.asList(terms));
this(field, Arrays.asList(terms));
}
/**
@ -158,224 +61,5 @@ public class TermsQuery extends Query implements Accountable {
this(Arrays.asList(terms));
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
if (termData.size() <= threshold) {
BooleanQuery.Builder bq = new BooleanQuery.Builder();
TermIterator iterator = termData.iterator();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
bq.add(new TermQuery(new Term(iterator.field(), BytesRef.deepCopyOf(term))), Occur.SHOULD);
}
return new ConstantScoreQuery(bq.build());
}
return super.rewrite(reader);
}
@Override
public boolean equals(Object other) {
return sameClassAs(other) &&
equalsTo(getClass().cast(other));
}
private boolean equalsTo(TermsQuery other) {
// termData might be heavy to compare so check the hash code first
return termDataHashCode == other.termDataHashCode &&
termData.equals(other.termData);
}
@Override
public int hashCode() {
return 31 * classHash() + termDataHashCode;
}
/** Returns the terms wrapped in a PrefixCodedTerms. */
public PrefixCodedTerms getTermData() {
return termData;
}
@Override
public String toString(String defaultField) {
StringBuilder builder = new StringBuilder();
boolean first = true;
TermIterator iterator = termData.iterator();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
if (!first) {
builder.append(' ');
}
first = false;
builder.append(new Term(iterator.field(), term).toString());
}
return builder.toString();
}
@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED + termData.ramBytesUsed();
}
@Override
public Collection<Accountable> getChildResources() {
return Collections.emptyList();
}
private static class TermAndState {
final String field;
final TermsEnum termsEnum;
final BytesRef term;
final TermState state;
final int docFreq;
final long totalTermFreq;
TermAndState(String field, TermsEnum termsEnum) throws IOException {
this.field = field;
this.termsEnum = termsEnum;
this.term = BytesRef.deepCopyOf(termsEnum.term());
this.state = termsEnum.termState();
this.docFreq = termsEnum.docFreq();
this.totalTermFreq = termsEnum.totalTermFreq();
}
}
private static class WeightOrDocIdSet {
final Weight weight;
final DocIdSet set;
WeightOrDocIdSet(Weight weight) {
this.weight = Objects.requireNonNull(weight);
this.set = null;
}
WeightOrDocIdSet(DocIdSet bitset) {
this.set = bitset;
this.weight = null;
}
}
/**
 * Creates a constant-score weight for this query. For every leaf the weight
 * looks up the query terms and then either delegates to the weight of a
 * constant-score boolean disjunction (when no more than the threshold number
 * of terms match) or scores from a doc id set built out of the postings of
 * all matching terms.
 */
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
  return new ConstantScoreWeight(this, boost) {

    /**
     * Intentionally reports no terms. This query targets abuse cases where
     * the number of terms is too high to run efficiently as a BooleanQuery,
     * so its terms are likewise hidden in order to protect highlighters.
     */
    @Override
    public void extractTerms(Set<Term> terms) {
      // nothing to extract on purpose
    }

    /**
     * Resolves the query terms against one leaf. Returns a delegate weight
     * (a disjunction over the matching terms) when few terms match, and a
     * doc id set containing the matching docs otherwise.
     */
    private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
      final LeafReader leafReader = context.reader();

      // Up to 'threshold' matching terms are remembered in 'collected'.
      // Once there are too many of them, 'collected' is dropped (set to
      // null) and every posting list is accumulated into 'docIdSetBuilder'.
      final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
      assert termData.size() > threshold : "Query should have been rewritten";
      List<TermAndState> collected = new ArrayList<>(threshold);
      DocIdSetBuilder docIdSetBuilder = null;

      final Fields fields = leafReader.fields();
      String previousField = null;
      Terms fieldTerms = null;
      TermsEnum termsEnum = null;
      PostingsEnum postings = null;

      final TermIterator termIterator = termData.iterator();
      BytesRef term;
      while ((term = termIterator.next()) != null) {
        final String field = termIterator.field();
        // identity comparison is intentional here
        if (field != previousField) {
          fieldTerms = fields.terms(field);
          termsEnum = (fieldTerms == null) ? null : fieldTerms.iterator();
          previousField = field;
        }

        // skip terms whose field or value does not exist in this leaf
        if (termsEnum == null || !termsEnum.seekExact(term)) {
          continue;
        }

        if (collected == null) {
          // already accumulating into the builder
          postings = termsEnum.postings(postings, PostingsEnum.NONE);
          docIdSetBuilder.add(postings);
          continue;
        }

        if (collected.size() < threshold) {
          collected.add(new TermAndState(field, termsEnum));
          continue;
        }

        // One matching term more than the threshold: switch to the builder.
        assert collected.size() == threshold;
        // Common case: all terms share one field, so the builder can be given
        // that field's terms to leverage their stats. Corner case: several
        // fields, so no assumptions are made about the docs we will get.
        docIdSetBuilder = singleField
            ? new DocIdSetBuilder(leafReader.maxDoc(), fieldTerms)
            : new DocIdSetBuilder(leafReader.maxDoc());

        // The current term goes in first, while the enum still points at it ...
        postings = termsEnum.postings(postings, PostingsEnum.NONE);
        docIdSetBuilder.add(postings);
        // ... then every term remembered so far is replayed from its saved state.
        for (TermAndState remembered : collected) {
          remembered.termsEnum.seekExact(remembered.term, remembered.state);
          postings = remembered.termsEnum.postings(postings, PostingsEnum.NONE);
          docIdSetBuilder.add(postings);
        }
        collected = null;
      }

      if (collected == null) {
        assert docIdSetBuilder != null;
        return new WeightOrDocIdSet(docIdSetBuilder.build());
      }

      // Few enough matches: run them as a constant-score disjunction.
      assert docIdSetBuilder == null;
      final BooleanQuery.Builder disjunction = new BooleanQuery.Builder();
      for (TermAndState matched : collected) {
        final TermContext termContext = new TermContext(searcher.getTopReaderContext());
        termContext.register(matched.state, context.ord, matched.docFreq, matched.totalTermFreq);
        final Term matchedTerm = new Term(matched.field, matched.term);
        disjunction.add(new TermQuery(matchedTerm, termContext), Occur.SHOULD);
      }
      final Query constantScore = new ConstantScoreQuery(disjunction.build());
      final Query rewritten = searcher.rewrite(constantScore);
      return new WeightOrDocIdSet(rewritten.createWeight(searcher, needsScores, score()));
    }

    /**
     * Wraps the iterator of the given set in a constant-score scorer, or
     * returns {@code null} when either the set or its iterator is {@code null}.
     */
    private Scorer scorer(DocIdSet set) throws IOException {
      final DocIdSetIterator iterator = (set == null) ? null : set.iterator();
      return (iterator == null) ? null : new ConstantScoreScorer(this, score(), iterator);
    }

    /**
     * Returns the delegate weight's bulk scorer when the leaf was rewritten
     * to a weight; otherwise a default bulk scorer over the doc id set, or
     * {@code null} when that set yields no scorer.
     */
    @Override
    public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
      final WeightOrDocIdSet rewritten = rewrite(context);
      if (rewritten.weight != null) {
        return rewritten.weight.bulkScorer(context);
      }
      final Scorer scorer = scorer(rewritten.set);
      return (scorer == null) ? null : new DefaultBulkScorer(scorer);
    }

    /**
     * Returns the delegate weight's scorer when the leaf was rewritten to a
     * weight; otherwise a scorer over the doc id set (possibly {@code null}).
     */
    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      final WeightOrDocIdSet rewritten = rewrite(context);
      final Weight delegate = rewritten.weight;
      return (delegate != null) ? delegate.scorer(context) : scorer(rewritten.set);
    }
  };
}
}

View File

@ -19,10 +19,8 @@ package org.apache.lucene.spatial.prefix;
import java.util.ArrayList;
import java.util.List;
import org.locationtech.spatial4j.shape.Point;
import org.locationtech.spatial4j.shape.Shape;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.spatial.prefix.tree.Cell;
import org.apache.lucene.spatial.prefix.tree.CellIterator;
import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
@ -31,10 +29,12 @@ import org.apache.lucene.spatial.query.SpatialOperation;
import org.apache.lucene.spatial.query.UnsupportedSpatialOperation;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.locationtech.spatial4j.shape.Point;
import org.locationtech.spatial4j.shape.Shape;
/**
* A basic implementation of {@link PrefixTreeStrategy} using a large
* {@link TermsQuery} of all the cells from
* {@link TermInSetQuery} of all the cells from
* {@link SpatialPrefixTree#getTreeCellIterator(org.locationtech.spatial4j.shape.Shape, int)}.
* It only supports the search of indexed Point shapes.
* <p>
@ -105,7 +105,7 @@ public class TermQueryPrefixTreeStrategy extends PrefixTreeStrategy {
}
//unfortunately TermsQuery will needlessly sort & dedupe
//TODO an automatonQuery might be faster?
return new TermsQuery(getFieldName(), terms);
return new TermInSetQuery(getFieldName(), terms);
}
}

View File

@ -23,12 +23,11 @@ import java.util.Collections;
import java.util.List;
import com.carrotsearch.randomizedtesting.annotations.Repeat;
import org.locationtech.spatial4j.shape.Shape;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.spatial.StrategyTestCase;
import org.apache.lucene.spatial.prefix.NumberRangePrefixTreeStrategy.Facets;
import org.apache.lucene.spatial.prefix.tree.Cell;
@ -40,6 +39,7 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.junit.Before;
import org.junit.Test;
import org.locationtech.spatial4j.shape.Shape;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomInt;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
@ -132,7 +132,7 @@ public class NumberRangeFacetsTest extends StrategyTestCase {
terms.add(new Term("id", acceptDocId.toString()));
}
topAcceptDocs = searchForDocBits(new TermsQuery(terms));
topAcceptDocs = searchForDocBits(new TermInSetQuery(terms));
}
}

View File

@ -26,6 +26,15 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import com.carrotsearch.hppc.IntHashSet;
import com.carrotsearch.hppc.IntObjectHashMap;
import com.carrotsearch.hppc.LongHashSet;
import com.carrotsearch.hppc.LongObjectHashMap;
import com.carrotsearch.hppc.LongObjectMap;
import com.carrotsearch.hppc.cursors.IntObjectCursor;
import com.carrotsearch.hppc.cursors.LongCursor;
import com.carrotsearch.hppc.cursors.LongObjectCursor;
import com.carrotsearch.hppc.cursors.ObjectCursor;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
@ -36,7 +45,6 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
@ -46,6 +54,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopFieldCollector;
@ -83,16 +92,6 @@ import org.apache.solr.uninverting.UninvertingReader;
import org.apache.solr.util.plugin.PluginInfoInitialized;
import org.apache.solr.util.plugin.SolrCoreAware;
import com.carrotsearch.hppc.IntHashSet;
import com.carrotsearch.hppc.IntObjectHashMap;
import com.carrotsearch.hppc.LongHashSet;
import com.carrotsearch.hppc.LongObjectHashMap;
import com.carrotsearch.hppc.LongObjectMap;
import com.carrotsearch.hppc.cursors.IntObjectCursor;
import com.carrotsearch.hppc.cursors.LongCursor;
import com.carrotsearch.hppc.cursors.LongObjectCursor;
import com.carrotsearch.hppc.cursors.ObjectCursor;
/**
* The ExpandComponent is designed to work with the CollapsingPostFilter.
* The CollapsingPostFilter collapses a result set on a field.
@ -720,7 +719,7 @@ public class ExpandComponent extends SearchComponent implements PluginInfoInitia
bytesRefs[++index] = term.toBytesRef();
}
return new SolrConstantScoreQuery(new QueryWrapperFilter(new TermsQuery(fname, bytesRefs)));
return new SolrConstantScoreQuery(new QueryWrapperFilter(new TermInSetQuery(fname, bytesRefs)));
}
private Query getGroupQuery(String fname,
@ -733,7 +732,7 @@ public class ExpandComponent extends SearchComponent implements PluginInfoInitia
IntObjectCursor<BytesRef> cursor = it.next();
bytesRefs[++index] = cursor.value;
}
return new SolrConstantScoreQuery(new QueryWrapperFilter(new TermsQuery(fname, bytesRefs)));
return new SolrConstantScoreQuery(new QueryWrapperFilter(new TermInSetQuery(fname, bytesRefs)));
}

View File

@ -39,7 +39,6 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.legacy.LegacyNumericType;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@ -51,6 +50,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSelector;
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.BytesRef;
@ -765,7 +765,7 @@ public abstract class FieldType extends FieldProperties {
readableToIndexed(externalVal, br);
lst.add( br.toBytesRef() );
}
return new TermsQuery(field.getName() , lst);
return new TermInSetQuery(field.getName() , lst);
}
/**

View File

@ -20,13 +20,13 @@ import java.util.Arrays;
import java.util.regex.Pattern;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocValuesTermsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
@ -60,7 +60,7 @@ public class TermsQParserPlugin extends QParserPlugin {
termsFilter {
@Override
Filter makeFilter(String fname, BytesRef[] bytesRefs) {
return new QueryWrapperFilter(new TermsQuery(fname, bytesRefs));
return new QueryWrapperFilter(new TermInSetQuery(fname, bytesRefs));
}
},
booleanQuery {

View File

@ -25,7 +25,6 @@ import java.util.TreeSet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
@ -35,6 +34,7 @@ import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
@ -281,7 +281,7 @@ public class GraphQuery extends Query {
collectorTerms.get(i, ref);
termList.add(ref);
}
q = new TermsQuery(fromField, termList);
q = new TermInSetQuery(fromField, termList);
}
// If there is a filter to be used while crawling the graph, add that.

View File

@ -19,12 +19,12 @@ package org.apache.solr.search;
import java.util.Locale;
import java.util.Random;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.core.SolrInfoMBean;
@ -224,13 +224,13 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
qParser = QParser.getParser("foo_s:(a b c d e f g h i j k l m n o p q r s t u v w x y z)", req);
qParser.setIsFilter(true); // this may change in the future
q = qParser.getQuery();
assertEquals(26, ((TermsQuery)q).getTermData().size());
assertEquals(26, ((TermInSetQuery)q).getTermData().size());
// large numeric filter query should use TermsQuery (for trie fields)
qParser = QParser.getParser("foo_i:(1 2 3 4 5 6 7 8 9 10 20 19 18 17 16 15 14 13 12 11)", req);
qParser.setIsFilter(true); // this may change in the future
q = qParser.getQuery();
assertEquals(20, ((TermsQuery)q).getTermData().size());
assertEquals(20, ((TermInSetQuery)q).getTermData().size());
// a filter() clause inside a relevancy query should be able to use a TermsQuery
qParser = QParser.getParser("foo_s:aaa filter(foo_s:(a b c d e f g h i j k l m n o p q r s t u v w x y z))", req);
@ -245,7 +245,7 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
qq = ((FilterQuery)qq).getQuery();
}
assertEquals(26, ((TermsQuery)qq).getTermData().size());
assertEquals(26, ((TermInSetQuery)qq).getTermData().size());
// test mixed boolean query, including quotes (which shouldn't matter)
qParser = QParser.getParser("foo_s:(a +aaa b -bbb c d e f bar_s:(qqq www) g h i j k l m n o p q r s t u v w x y z)", req);
@ -255,9 +255,9 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
qq = null;
for (BooleanClause clause : ((BooleanQuery)q).clauses()) {
qq = clause.getQuery();
if (qq instanceof TermsQuery) break;
if (qq instanceof TermInSetQuery) break;
}
assertEquals(26, ((TermsQuery)qq).getTermData().size());
assertEquals(26, ((TermInSetQuery)qq).getTermData().size());
req.close();
}