LUCENE-7637: Require that all terms of a TermsQuery come from the same field.

This commit is contained in:
Adrien Grand 2017-01-17 08:45:28 +01:00
parent 86233cb95d
commit 43874fc5b5
5 changed files with 84 additions and 164 deletions

View File

@ -63,6 +63,14 @@ Other
======================= Lucene 6.5.0 =======================
API Changes
* LUCENE-7624: TermsQuery has been renamed as TermInSetQuery and moved to core.
(Alan Woodward)
* LUCENE-7637: TermInSetQuery requires that all terms come from the same field.
(Adrien Grand)
New Features
* LUCENE-7623: Add FunctionScoreQuery and FunctionMatchQuery (Alan Woodward,
@ -107,9 +115,6 @@ API Changes
* LUCENE-7611: DocumentValueSourceDictionary now takes a LongValuesSource
as a parameter, and the ValueSource equivalent is deprecated (Alan Woodward)
* LUCENE-7624: TermsQuery has been renamed as TermInSetQuery and moved to core.
(Alan Woodward)
New features
* LUCENE-5867: Added BooleanSimilarity. (Robert Muir, Adrien Grand)

View File

@ -21,7 +21,6 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
@ -73,39 +72,12 @@ public class TermInSetQuery extends Query implements Accountable {
// Same threshold as MultiTermQueryConstantScoreWrapper
static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16;
private final boolean singleField; // whether all terms are from the same field
private final String field;
private final PrefixCodedTerms termData;
private final int termDataHashCode; // cached hashcode of termData
/**
* Creates a new {@link TermInSetQuery} from the given collection. It
* can contain duplicate terms and multiple fields.
*/
public TermInSetQuery(Collection<Term> terms) {
Term[] sortedTerms = terms.toArray(new Term[terms.size()]);
// already sorted if we are a SortedSet with natural order
boolean sorted = terms instanceof SortedSet && ((SortedSet<Term>)terms).comparator() == null;
if (!sorted) {
ArrayUtil.timSort(sortedTerms);
}
PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
Set<String> fields = new HashSet<>();
Term previous = null;
for (Term term : sortedTerms) {
if (term.equals(previous) == false) {
fields.add(term.field());
builder.add(term);
}
previous = term;
}
singleField = fields.size() == 1;
termData = builder.finish();
termDataHashCode = termData.hashCode();
}
/**
* Creates a new {@link TermInSetQuery} from the given collection for
* a single field. It can contain duplicate terms.
* Creates a new {@link TermInSetQuery} from the given collection of terms.
*/
public TermInSetQuery(String field, Collection<BytesRef> terms) {
BytesRef[] sortedTerms = terms.toArray(new BytesRef[terms.size()]);
@ -125,27 +97,18 @@ public class TermInSetQuery extends Query implements Accountable {
builder.add(field, term);
previous.copyBytes(term);
}
singleField = true;
this.field = field;
termData = builder.finish();
termDataHashCode = termData.hashCode();
}
/**
* Creates a new {@link TermInSetQuery} from the given {@link BytesRef} array for
* a single field.
* Creates a new {@link TermInSetQuery} from the given array of terms.
*/
public TermInSetQuery(String field, BytesRef...terms) {
this(field, Arrays.asList(terms));
}
/**
* Creates a new {@link TermInSetQuery} from the given array. The array can
* contain duplicate terms and multiple fields.
*/
public TermInSetQuery(final Term... terms) {
this(Arrays.asList(terms));
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
@ -167,6 +130,7 @@ public class TermInSetQuery extends Query implements Accountable {
}
private boolean equalsTo(TermInSetQuery other) {
// no need to check 'field' explicitly since it is encoded in 'termData'
// termData might be heavy to compare so check the hash code first
return termDataHashCode == other.termDataHashCode &&
termData.equals(other.termData);
@ -260,6 +224,15 @@ public class TermInSetQuery extends Query implements Accountable {
private WeightOrDocIdSet rewrite(LeafReaderContext context) throws IOException {
final LeafReader reader = context.reader();
final Fields fields = reader.fields();
Terms terms = fields.terms(field);
if (terms == null) {
return null;
}
TermsEnum termsEnum = terms.iterator();
PostingsEnum docs = null;
TermIterator iterator = termData.iterator();
// We will first try to collect up to 'threshold' terms into 'matchingTerms'
// if there are two many terms, we will fall back to building the 'builder'
final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
@ -267,25 +240,9 @@ public class TermInSetQuery extends Query implements Accountable {
List<TermAndState> matchingTerms = new ArrayList<>(threshold);
DocIdSetBuilder builder = null;
final Fields fields = reader.fields();
String lastField = null;
Terms terms = null;
TermsEnum termsEnum = null;
PostingsEnum docs = null;
TermIterator iterator = termData.iterator();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
String field = iterator.field();
// comparing references is fine here
if (field != lastField) {
terms = fields.terms(field);
if (terms == null) {
termsEnum = null;
} else {
termsEnum = terms.iterator();
}
lastField = field;
}
if (termsEnum != null && termsEnum.seekExact(term)) {
assert field.equals(iterator.field());
if (termsEnum.seekExact(term)) {
if (matchingTerms == null) {
docs = termsEnum.postings(docs, PostingsEnum.NONE);
builder.add(docs);
@ -293,15 +250,7 @@ public class TermInSetQuery extends Query implements Accountable {
matchingTerms.add(new TermAndState(field, termsEnum));
} else {
assert matchingTerms.size() == threshold;
if (singleField) {
// common case: all terms are in the same field
// use an optimized builder that leverages terms stats to be more efficient
builder = new DocIdSetBuilder(reader.maxDoc(), terms);
} else {
// corner case: different fields
// don't make assumptions about the docs we will get
builder = new DocIdSetBuilder(reader.maxDoc());
}
docs = termsEnum.postings(docs, PostingsEnum.NONE);
builder.add(docs);
for (TermAndState t : matchingTerms) {
@ -344,7 +293,9 @@ public class TermInSetQuery extends Query implements Accountable {
@Override
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
final WeightOrDocIdSet weightOrBitSet = rewrite(context);
if (weightOrBitSet.weight != null) {
if (weightOrBitSet == null) {
return null;
} else if (weightOrBitSet.weight != null) {
return weightOrBitSet.weight.bulkScorer(context);
} else {
final Scorer scorer = scorer(weightOrBitSet.set);
@ -358,7 +309,9 @@ public class TermInSetQuery extends Query implements Accountable {
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
final WeightOrDocIdSet weightOrBitSet = rewrite(context);
if (weightOrBitSet.weight != null) {
if (weightOrBitSet == null) {
return null;
} else if (weightOrBitSet.weight != null) {
return weightOrBitSet.weight.scorer(context);
} else {
return scorer(weightOrBitSet.set);

View File

@ -18,15 +18,12 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
@ -53,25 +50,25 @@ public class TermInSetQueryTest extends LuceneTestCase {
public void testDuel() throws IOException {
final int iters = atLeast(2);
final String field = "f";
for (int iter = 0; iter < iters; ++iter) {
final List<Term> allTerms = new ArrayList<>();
final List<BytesRef> allTerms = new ArrayList<>();
final int numTerms = TestUtil.nextInt(random(), 1, 1 << TestUtil.nextInt(random(), 1, 10));
for (int i = 0; i < numTerms; ++i) {
final String field = usually() ? "f" : "g";
final String value = TestUtil.randomAnalysisString(random(), 10, true);
allTerms.add(new Term(field, value));
allTerms.add(new BytesRef(value));
}
Directory dir = newDirectory();
RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
final int numDocs = atLeast(100);
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
final Term term = allTerms.get(random().nextInt(allTerms.size()));
doc.add(new StringField(term.field(), term.text(), Store.NO));
final BytesRef term = allTerms.get(random().nextInt(allTerms.size()));
doc.add(new StringField(field, term, Store.NO));
iw.addDocument(doc);
}
if (numTerms > 1 && random().nextBoolean()) {
iw.deleteDocuments(new TermQuery(allTerms.get(0)));
iw.deleteDocuments(new TermQuery(new Term(field, allTerms.get(0))));
}
iw.commit();
final IndexReader reader = iw.getReader();
@ -87,16 +84,16 @@ public class TermInSetQueryTest extends LuceneTestCase {
for (int i = 0; i < 100; ++i) {
final float boost = random().nextFloat() * 10;
final int numQueryTerms = TestUtil.nextInt(random(), 1, 1 << TestUtil.nextInt(random(), 1, 8));
List<Term> queryTerms = new ArrayList<>();
List<BytesRef> queryTerms = new ArrayList<>();
for (int j = 0; j < numQueryTerms; ++j) {
queryTerms.add(allTerms.get(random().nextInt(allTerms.size())));
}
final BooleanQuery.Builder bq = new BooleanQuery.Builder();
for (Term t : queryTerms) {
bq.add(new TermQuery(t), Occur.SHOULD);
for (BytesRef t : queryTerms) {
bq.add(new TermQuery(new Term(field, t)), Occur.SHOULD);
}
final Query q1 = new ConstantScoreQuery(bq.build());
final Query q2 = new TermInSetQuery(queryTerms);
final Query q2 = new TermInSetQuery(field, queryTerms);
assertSameMatches(searcher, new BoostQuery(q1, boost), new BoostQuery(q2, boost), true);
}
@ -118,103 +115,72 @@ public class TermInSetQueryTest extends LuceneTestCase {
}
}
private TermInSetQuery termsQuery(boolean singleField, Term...terms) {
return termsQuery(singleField, Arrays.asList(terms));
}
private TermInSetQuery termsQuery(boolean singleField, Collection<Term> termList) {
if (!singleField) {
return new TermInSetQuery(new ArrayList<>(termList));
}
final TermInSetQuery filter;
List<BytesRef> bytes = new ArrayList<>();
String field = null;
for (Term term : termList) {
bytes.add(term.bytes());
if (field != null) {
assertEquals(term.field(), field);
}
field = term.field();
}
assertNotNull(field);
filter = new TermInSetQuery(field, bytes);
return filter;
}
public void testHashCodeAndEquals() {
int num = atLeast(100);
final boolean singleField = random().nextBoolean();
List<Term> terms = new ArrayList<>();
Set<Term> uniqueTerms = new HashSet<>();
List<BytesRef> terms = new ArrayList<>();
Set<BytesRef> uniqueTerms = new HashSet<>();
for (int i = 0; i < num; i++) {
String field = "field" + (singleField ? "1" : random().nextInt(100));
String string = TestUtil.randomRealisticUnicodeString(random());
terms.add(new Term(field, string));
uniqueTerms.add(new Term(field, string));
TermInSetQuery left = termsQuery(singleField ? random().nextBoolean() : false, uniqueTerms);
terms.add(new BytesRef(string));
uniqueTerms.add(new BytesRef(string));
TermInSetQuery left = new TermInSetQuery("field", uniqueTerms);
Collections.shuffle(terms, random());
TermInSetQuery right = termsQuery(singleField ? random().nextBoolean() : false, terms);
TermInSetQuery right = new TermInSetQuery("field", terms);
assertEquals(right, left);
assertEquals(right.hashCode(), left.hashCode());
if (uniqueTerms.size() > 1) {
List<Term> asList = new ArrayList<>(uniqueTerms);
List<BytesRef> asList = new ArrayList<>(uniqueTerms);
asList.remove(0);
TermInSetQuery notEqual = termsQuery(singleField ? random().nextBoolean() : false, asList);
TermInSetQuery notEqual = new TermInSetQuery("field", asList);
assertFalse(left.equals(notEqual));
assertFalse(right.equals(notEqual));
}
}
TermInSetQuery tq1 = new TermInSetQuery(new Term("thing", "apple"));
TermInSetQuery tq2 = new TermInSetQuery(new Term("thing", "orange"));
TermInSetQuery tq1 = new TermInSetQuery("thing", new BytesRef("apple"));
TermInSetQuery tq2 = new TermInSetQuery("thing", new BytesRef("orange"));
assertFalse(tq1.hashCode() == tq2.hashCode());
// different fields with the same term should have differing hashcodes
tq1 = new TermInSetQuery(new Term("thing1", "apple"));
tq2 = new TermInSetQuery(new Term("thing2", "apple"));
tq1 = new TermInSetQuery("thing", new BytesRef("apple"));
tq2 = new TermInSetQuery("thing2", new BytesRef("apple"));
assertFalse(tq1.hashCode() == tq2.hashCode());
}
public void testSingleFieldEquals() {
public void testSimpleEquals() {
// Two terms with the same hash code
assertEquals("AaAaBB".hashCode(), "BBBBBB".hashCode());
TermInSetQuery left = termsQuery(true, new Term("id", "AaAaAa"), new Term("id", "AaAaBB"));
TermInSetQuery right = termsQuery(true, new Term("id", "AaAaAa"), new Term("id", "BBBBBB"));
TermInSetQuery left = new TermInSetQuery("id", new BytesRef("AaAaAa"), new BytesRef("AaAaBB"));
TermInSetQuery right = new TermInSetQuery("id", new BytesRef("AaAaAa"), new BytesRef("BBBBBB"));
assertFalse(left.equals(right));
}
public void testToString() {
TermInSetQuery termsQuery = new TermInSetQuery(new Term("field1", "a"),
new Term("field1", "b"),
new Term("field1", "c"));
TermInSetQuery termsQuery = new TermInSetQuery("field1",
new BytesRef("a"), new BytesRef("b"), new BytesRef("c"));
assertEquals("field1:a field1:b field1:c", termsQuery.toString());
}
public void testDedup() {
Query query1 = new TermInSetQuery(new Term("foo", "bar"));
Query query2 = new TermInSetQuery(new Term("foo", "bar"), new Term("foo", "bar"));
Query query1 = new TermInSetQuery("foo", new BytesRef("bar"));
Query query2 = new TermInSetQuery("foo", new BytesRef("bar"), new BytesRef("bar"));
QueryUtils.checkEqual(query1, query2);
}
public void testOrderDoesNotMatter() {
// order of terms if different
Query query1 = new TermInSetQuery(new Term("foo", "bar"), new Term("foo", "baz"));
Query query2 = new TermInSetQuery(new Term("foo", "baz"), new Term("foo", "bar"));
QueryUtils.checkEqual(query1, query2);
// order of fields is different
query1 = new TermInSetQuery(new Term("foo", "bar"), new Term("bar", "bar"));
query2 = new TermInSetQuery(new Term("bar", "bar"), new Term("foo", "bar"));
Query query1 = new TermInSetQuery("foo", new BytesRef("bar"), new BytesRef("baz"));
Query query2 = new TermInSetQuery("foo", new BytesRef("baz"), new BytesRef("bar"));
QueryUtils.checkEqual(query1, query2);
}
public void testRamBytesUsed() {
List<Term> terms = new ArrayList<>();
List<BytesRef> terms = new ArrayList<>();
final int numTerms = 1000 + random().nextInt(1000);
for (int i = 0; i < numTerms; ++i) {
terms.add(new Term("f", RandomStrings.randomUnicodeOfLength(random(), 10)));
terms.add(new BytesRef(RandomStrings.randomUnicodeOfLength(random(), 10)));
}
TermInSetQuery query = new TermInSetQuery(terms);
TermInSetQuery query = new TermInSetQuery("f", terms);
final long actualRamBytesUsed = RamUsageTester.sizeOf(query);
final long expectedRamBytesUsed = query.ramBytesUsed();
// error margin within 5%
@ -281,43 +247,40 @@ public class TermInSetQueryTest extends LuceneTestCase {
}
public void testPullOneTermsEnumPerField() throws Exception {
public void testPullOneTermsEnum() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
Document doc = new Document();
doc.add(new StringField("foo", "1", Store.NO));
doc.add(new StringField("bar", "2", Store.NO));
doc.add(new StringField("baz", "3", Store.NO));
w.addDocument(doc);
DirectoryReader reader = w.getReader();
w.close();
final AtomicInteger counter = new AtomicInteger();
DirectoryReader wrapped = new TermsCountingDirectoryReaderWrapper(reader, counter);
final List<Term> terms = new ArrayList<>();
final Set<String> fields = new HashSet<>();
final List<BytesRef> terms = new ArrayList<>();
// enough terms to avoid the rewrite
final int numTerms = TestUtil.nextInt(random(), TermInSetQuery.BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD + 1, 100);
for (int i = 0; i < numTerms; ++i) {
final String field = RandomPicks.randomFrom(random(), new String[] {"foo", "bar", "baz"});
final BytesRef term = new BytesRef(RandomStrings.randomUnicodeOfCodepointLength(random(), 10));
fields.add(field);
terms.add(new Term(field, term));
terms.add(term);
}
new IndexSearcher(wrapped).count(new TermInSetQuery(terms));
assertEquals(fields.size(), counter.get());
assertEquals(0, new IndexSearcher(wrapped).count(new TermInSetQuery("bar", terms)));
assertEquals(0, counter.get()); // missing field
new IndexSearcher(wrapped).count(new TermInSetQuery("foo", terms));
assertEquals(1, counter.get());
wrapped.close();
dir.close();
}
public void testBinaryToString() {
TermInSetQuery query = new TermInSetQuery(new Term("field", new BytesRef(new byte[] { (byte) 0xff, (byte) 0xfe })));
TermInSetQuery query = new TermInSetQuery("field", new BytesRef(new byte[] { (byte) 0xff, (byte) 0xfe }));
assertEquals("field:[ff fe]", query.toString());
}
public void testIsConsideredCostlyByQueryCache() throws IOException {
TermInSetQuery query = new TermInSetQuery(new Term("foo", "bar"), new Term("foo", "baz"));
TermInSetQuery query = new TermInSetQuery("foo", new BytesRef("bar"), new BytesRef("baz"));
UsageTrackingQueryCachingPolicy policy = new UsageTrackingQueryCachingPolicy();
assertFalse(policy.shouldCache(query));
policy.onUse(query);

View File

@ -19,9 +19,9 @@ package org.apache.lucene.facet;
import java.util.ArrayList;
import java.util.Collection;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.util.BytesRef;
/**
* A multi-terms {@link Query} over a {@link FacetField}.
@ -38,7 +38,7 @@ public class MultiFacetQuery extends TermInSetQuery {
* Creates a new {@code MultiFacetQuery} filtering the query on the given dimension.
*/
public MultiFacetQuery(final FacetsConfig facetsConfig, final String dimension, final String[]... paths) {
super(toTerms(facetsConfig.getDimConfig(dimension), dimension, paths));
super(facetsConfig.getDimConfig(dimension).indexFieldName, toTerms(dimension, paths));
}
/**
@ -47,14 +47,13 @@ public class MultiFacetQuery extends TermInSetQuery {
* <b>NOTE:</b>Uses FacetsConfig.DEFAULT_DIM_CONFIG.
*/
public MultiFacetQuery(final String dimension, final String[]... paths) {
super(toTerms(FacetsConfig.DEFAULT_DIM_CONFIG, dimension, paths));
super(FacetsConfig.DEFAULT_DIM_CONFIG.indexFieldName, toTerms(dimension, paths));
}
static Collection<Term> toTerms(final FacetsConfig.DimConfig dimConfig, final String dimension,
final String[]... paths) {
final Collection<Term> terms = new ArrayList<>(paths.length);
static Collection<BytesRef> toTerms(final String dimension, final String[]... paths) {
final Collection<BytesRef> terms = new ArrayList<>(paths.length);
for (String[] path : paths)
terms.add(FacetQuery.toTerm(dimConfig, dimension, path));
terms.add(new BytesRef(FacetsConfig.pathToString(dimension, path)));
return terms;
}

View File

@ -24,7 +24,6 @@ import java.util.List;
import com.carrotsearch.randomizedtesting.annotations.Repeat;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.TermInSetQuery;
@ -36,6 +35,7 @@ import org.apache.lucene.spatial.prefix.tree.DateRangePrefixTree;
import org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree;
import org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.junit.Before;
import org.junit.Test;
@ -127,12 +127,12 @@ public class NumberRangeFacetsTest extends StrategyTestCase {
Collections.shuffle(acceptFieldIds, random());
acceptFieldIds = acceptFieldIds.subList(0, randomInt(acceptFieldIds.size()));
if (!acceptFieldIds.isEmpty()) {
List<Term> terms = new ArrayList<>();
List<BytesRef> terms = new ArrayList<>();
for (Integer acceptDocId : acceptFieldIds) {
terms.add(new Term("id", acceptDocId.toString()));
terms.add(new BytesRef(acceptDocId.toString()));
}
topAcceptDocs = searchForDocBits(new TermInSetQuery(terms));
topAcceptDocs = searchForDocBits(new TermInSetQuery("id", terms));
}
}