Add field as a separate input to newSynonymQuery (#11941)

QueryBuilder#newSynonymQuery takes an array of TermAndBoost objects as a
parameter and uses the field of the first term in the array as its field. However,
there are cases where this array may be empty, which will result in an
ArrayOutOfBoundsException.

This commit reworks QueryBuilder so that TermAndBoost contains plain
BytesRefs, and passes the field as a separate parameter. This guards against
accidental calls to newSynonymQuery with an empty list - in this case, an
empty synonym query is generated rather than an exception. It also
refactors SynonymQuery itself to hold BytesRefs rather than Terms, which
needlessly repeat the field for each entry.

Fixes #11864
This commit is contained in:
Alan Woodward 2022-11-21 09:55:14 +00:00 committed by GitHub
parent cd82a9bbdc
commit 332679886c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 49 additions and 41 deletions

View File

@ -111,6 +111,10 @@ API Changes
memory to provide stronger guarantees on query latency.
(Adrien Grand, Uwe Schindler)
* GITHUB#11941: QueryBuilder#add and #newSynonymQuery methods now take a `field` parameter,
to avoid possible exceptions when building queries from an empty term list. The helper
TermAndBoost class now holds a BytesRef rather than a Term. (Alan Woodward)
New Features
---------------------
* GITHUB#11795: Add ByteWritesTrackingDirectoryWrapper to expose metrics for bytes merged, flushed, and overall

View File

@ -80,6 +80,14 @@ public final class SynonymQuery extends Query {
if (field.equals(term.field()) == false) {
throw new IllegalArgumentException("Synonyms must be across the same field");
}
return addTerm(term.bytes(), boost);
}
/**
* Adds the provided {@code term} as a synonym, document frequencies of this term will be
* boosted by {@code boost}.
*/
public Builder addTerm(BytesRef term, float boost) {
if (Float.isNaN(boost) || Float.compare(boost, 0f) <= 0 || Float.compare(boost, 1f) > 0) {
throw new IllegalArgumentException(
"boost must be a positive float between 0 (exclusive) and 1 (inclusive)");
@ -110,7 +118,7 @@ public final class SynonymQuery extends Query {
public List<Term> getTerms() {
return Collections.unmodifiableList(
Arrays.stream(terms).map(TermAndBoost::getTerm).collect(Collectors.toList()));
Arrays.stream(terms).map(t -> new Term(field, t.term)).collect(Collectors.toList()));
}
@Override
@ -120,7 +128,7 @@ public final class SynonymQuery extends Query {
if (i != 0) {
builder.append(" ");
}
Query termQuery = new TermQuery(terms[i].term);
Query termQuery = new TermQuery(new Term(this.field, terms[i].term));
builder.append(termQuery.toString(field));
if (terms[i].boost != 1f) {
builder.append("^");
@ -148,7 +156,7 @@ public final class SynonymQuery extends Query {
return new BooleanQuery.Builder().build();
}
if (terms.length == 1 && terms[0].boost == 1f) {
return new TermQuery(terms[0].term);
return new TermQuery(new Term(field, terms[0].term));
}
return this;
}
@ -159,7 +167,7 @@ public final class SynonymQuery extends Query {
return;
}
QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.SHOULD, this);
Term[] ts = Arrays.stream(terms).map(t -> t.term).toArray(Term[]::new);
Term[] ts = Arrays.stream(terms).map(t -> new Term(field, t.term)).toArray(Term[]::new);
v.consumeTerms(this, ts);
}
@ -172,7 +180,7 @@ public final class SynonymQuery extends Query {
// if scores are not needed, let BooleanWeight deal with optimizing that case.
BooleanQuery.Builder bq = new BooleanQuery.Builder();
for (TermAndBoost term : terms) {
bq.add(new TermQuery(term.term), BooleanClause.Occur.SHOULD);
bq.add(new TermQuery(new Term(field, term.term)), BooleanClause.Occur.SHOULD);
}
return searcher
.rewrite(bq.build())
@ -191,16 +199,17 @@ public final class SynonymQuery extends Query {
super(query);
assert scoreMode.needsScores();
this.scoreMode = scoreMode;
CollectionStatistics collectionStats = searcher.collectionStatistics(terms[0].term.field());
CollectionStatistics collectionStats = searcher.collectionStatistics(field);
long docFreq = 0;
long totalTermFreq = 0;
termStates = new TermStates[terms.length];
for (int i = 0; i < termStates.length; i++) {
TermStates ts = TermStates.build(searcher.getTopReaderContext(), terms[i].term, true);
Term term = new Term(field, terms[i].term);
TermStates ts = TermStates.build(searcher.getTopReaderContext(), term, true);
termStates[i] = ts;
if (ts.docFreq() > 0) {
TermStatistics termStats =
searcher.termStatistics(terms[i].term, ts.docFreq(), ts.totalTermFreq());
searcher.termStatistics(term, ts.docFreq(), ts.totalTermFreq());
docFreq = Math.max(termStats.docFreq(), docFreq);
totalTermFreq += termStats.totalTermFreq();
}
@ -217,13 +226,12 @@ public final class SynonymQuery extends Query {
@Override
public Matches matches(LeafReaderContext context, int doc) throws IOException {
String field = terms[0].term.field();
Terms indexTerms = context.reader().terms(field);
if (indexTerms == null) {
return super.matches(context, doc);
}
List<Term> termList =
Arrays.stream(terms).map(TermAndBoost::getTerm).collect(Collectors.toList());
Arrays.stream(terms).map(t -> new Term(field, t.term)).collect(Collectors.toList());
return MatchesUtils.forField(
field,
() -> DisjunctionMatchesIterator.fromTerms(context, doc, getQuery(), field, termList));
@ -244,8 +252,7 @@ public final class SynonymQuery extends Query {
assert scorer instanceof TermScorer;
freq = ((TermScorer) scorer).freq();
}
LeafSimScorer docScorer =
new LeafSimScorer(simWeight, context.reader(), terms[0].term.field(), true);
LeafSimScorer docScorer = new LeafSimScorer(simWeight, context.reader(), field, true);
Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
Explanation scoreExplanation = docScorer.explain(doc, freqExplanation);
return Explanation.match(
@ -271,8 +278,8 @@ public final class SynonymQuery extends Query {
for (int i = 0; i < terms.length; i++) {
TermState state = termStates[i].get(context);
if (state != null) {
TermsEnum termsEnum = context.reader().terms(terms[i].term.field()).iterator();
termsEnum.seekExact(terms[i].term.bytes(), state);
TermsEnum termsEnum = context.reader().terms(field).iterator();
termsEnum.seekExact(terms[i].term, state);
if (scoreMode == ScoreMode.TOP_SCORES) {
ImpactsEnum impactsEnum = termsEnum.impacts(PostingsEnum.FREQS);
iterators.add(impactsEnum);
@ -290,8 +297,7 @@ public final class SynonymQuery extends Query {
return null;
}
LeafSimScorer simScorer =
new LeafSimScorer(simWeight, context.reader(), terms[0].term.field(), true);
LeafSimScorer simScorer = new LeafSimScorer(simWeight, context.reader(), field, true);
// we must optimize this case (term not in segment), disjunctions require >= 2 subs
if (iterators.size() == 1) {
@ -443,7 +449,7 @@ public final class SynonymQuery extends Query {
}
PriorityQueue<SubIterator> pq =
new PriorityQueue<SubIterator>(impacts.length) {
new PriorityQueue<>(impacts.length) {
@Override
protected boolean lessThan(SubIterator a, SubIterator b) {
if (a.current == null) { // means iteration is finished
@ -626,18 +632,14 @@ public final class SynonymQuery extends Query {
}
private static class TermAndBoost {
final Term term;
final BytesRef term;
final float boost;
TermAndBoost(Term term, float boost) {
TermAndBoost(BytesRef term, float boost) {
this.term = term;
this.boost = boost;
}
Term getTerm() {
return term;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;

View File

@ -65,13 +65,13 @@ public class QueryBuilder {
/** Wraps a term and boost */
public static class TermAndBoost {
/** the term */
public final Term term;
public final BytesRef term;
/** the boost */
public final float boost;
/** Creates a new TermAndBoost */
public TermAndBoost(Term term, float boost) {
this.term = term;
public TermAndBoost(BytesRef term, float boost) {
this.term = BytesRef.deepCopyOf(term);
this.boost = boost;
}
}
@ -390,21 +390,24 @@ public class QueryBuilder {
stream.reset();
List<TermAndBoost> terms = new ArrayList<>();
while (stream.incrementToken()) {
terms.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
terms.add(new TermAndBoost(termAtt.getBytesRef(), boostAtt.getBoost()));
}
return newSynonymQuery(terms.toArray(new TermAndBoost[0]));
return newSynonymQuery(field, terms.toArray(TermAndBoost[]::new));
}
protected void add(
BooleanQuery.Builder q, List<TermAndBoost> current, BooleanClause.Occur operator) {
String field,
BooleanQuery.Builder q,
List<TermAndBoost> current,
BooleanClause.Occur operator) {
if (current.isEmpty()) {
return;
}
if (current.size() == 1) {
q.add(newTermQuery(current.get(0).term, current.get(0).boost), operator);
q.add(newTermQuery(new Term(field, current.get(0).term), current.get(0).boost), operator);
} else {
q.add(newSynonymQuery(current.toArray(new TermAndBoost[0])), operator);
q.add(newSynonymQuery(field, current.toArray(TermAndBoost[]::new)), operator);
}
}
@ -421,13 +424,12 @@ public class QueryBuilder {
stream.reset();
while (stream.incrementToken()) {
if (posIncrAtt.getPositionIncrement() != 0) {
add(q, currentQuery, operator);
add(field, q, currentQuery, operator);
currentQuery.clear();
}
currentQuery.add(
new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
currentQuery.add(new TermAndBoost(termAtt.getBytesRef(), boostAtt.getBoost()));
}
add(q, currentQuery, operator);
add(field, q, currentQuery, operator);
return q.build();
}
@ -518,7 +520,7 @@ public class QueryBuilder {
if (graph.hasSidePath(start)) {
final Iterator<TokenStream> sidePathsIterator = graph.getFiniteStrings(start, end);
Iterator<Query> queries =
new Iterator<Query>() {
new Iterator<>() {
@Override
public boolean hasNext() {
return sidePathsIterator.hasNext();
@ -544,14 +546,14 @@ public class QueryBuilder {
s -> {
TermToBytesRefAttribute t = s.addAttribute(TermToBytesRefAttribute.class);
BoostAttribute b = s.addAttribute(BoostAttribute.class);
return new TermAndBoost(new Term(field, t.getBytesRef()), b.getBoost());
return new TermAndBoost(t.getBytesRef(), b.getBoost());
})
.toArray(TermAndBoost[]::new);
assert terms.length > 0;
if (terms.length == 1) {
positionalQuery = newTermQuery(terms[0].term, terms[0].boost);
positionalQuery = newTermQuery(new Term(field, terms[0].term), terms[0].boost);
} else {
positionalQuery = newSynonymQuery(terms);
positionalQuery = newSynonymQuery(field, terms);
}
}
if (positionalQuery != null) {
@ -599,8 +601,8 @@ public class QueryBuilder {
*
* @return new Query instance
*/
protected Query newSynonymQuery(TermAndBoost[] terms) {
SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].term.field());
protected Query newSynonymQuery(String field, TermAndBoost[] terms) {
SynonymQuery.Builder builder = new SynonymQuery.Builder(field);
for (TermAndBoost t : terms) {
builder.addTerm(t.term, t.boost);
}