LUCENE-9349: TermInSetQuery should use consumeMatchingTerms in visit() (#1465)

TermInSetQuery currently iterates through all its prefix-encoded terms
in order to build an array to pass back to its visitor when visit() is called.
This seems like a waste, particularly when the visitor is not actually
consuming the terms (for example, when doing a clause-count check
before executing a search). This commit changes TermInSetQuery to use
consumeTermsMatching(), and also changes the signature of this method so
that it takes a BytesRunAutomaton supplier to allow for lazy instantiation. In
addition, IndexSearcher's clause count check wasn't counting leaves that
called consumeTermsMatching().
This commit is contained in:
Alan Woodward 2020-04-29 10:19:05 +01:00 committed by GitHub
parent 59a8e83520
commit 267d70b66b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 85 additions and 12 deletions

View File

@ -142,6 +142,11 @@ API Changes
interface that offers the basic methods to acquire pending merges, run the merge and do accounting
around it. (Simon Willnauer)
* LUCENE-9349: QueryVisitor.consumeTermsMatching() now takes a
Supplier<ByteRunAutomaton> to enable queries that build large automata to
provide them lazily. TermsInSetQuery switches to using this method
to report matching terms. (Alan Woodward)
New Features
---------------------
(No changes)

View File

@ -31,6 +31,7 @@ import java.util.concurrent.Executor;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.function.Supplier;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
@ -47,6 +48,7 @@ import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
/** Implements search over a single IndexReader.
*
@ -790,6 +792,14 @@ public class IndexSearcher {
}
++numClauses;
}
@Override
public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
if (numClauses > maxClauseCount) {
throw new TooManyClauses();
}
++numClauses;
}
};
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.search;
import java.util.Arrays;
import java.util.Set;
import java.util.function.Supplier;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
@ -43,11 +44,11 @@ public abstract class QueryVisitor {
*
* @param query the leaf query
* @param field the field queried against
* @param automaton an automaton defining which terms match
* @param automaton a supplier for an automaton defining which terms match
*
* @lucene.experimental
*/
public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
visitLeaf(query); // default impl for backward compatibility
}

View File

@ -43,6 +43,11 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Operations;
/**
* Specialization for a disjunction over many terms that behaves like a
@ -126,13 +131,22 @@ public class TermInSetQuery extends Query implements Accountable {
if (visitor.acceptField(field) == false) {
return;
}
QueryVisitor v = visitor.getSubVisitor(Occur.SHOULD, this);
List<Term> terms = new ArrayList<>();
TermIterator iterator = termData.iterator();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
terms.add(new Term(field, BytesRef.deepCopyOf(term)));
if (termData.size() == 1) {
visitor.consumeTerms(this, new Term(field, termData.iterator().next()));
}
v.consumeTerms(this, terms.toArray(new Term[0]));
if (termData.size() > 1) {
visitor.consumeTermsMatching(this, field, this::asByteRunAutomaton);
}
}
private ByteRunAutomaton asByteRunAutomaton() {
TermIterator iterator = termData.iterator();
List<Automaton> automata = new ArrayList<>();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
automata.add(Automata.makeBinary(term));
}
return new CompiledAutomaton(Operations.union(automata)).runAutomaton;
}
@Override

View File

@ -354,12 +354,12 @@ public class CompiledAutomaton implements Accountable {
if (visitor.acceptField(field)) {
switch (type) {
case NORMAL:
visitor.consumeTermsMatching(parent, field, runAutomaton);
visitor.consumeTermsMatching(parent, field, () -> runAutomaton);
break;
case NONE:
break;
case ALL:
visitor.consumeTermsMatching(parent, field, new ByteRunAutomaton(Automata.makeAnyString()));
visitor.consumeTermsMatching(parent, field, () -> new ByteRunAutomaton(Automata.makeAnyString()));
break;
case SINGLE:
visitor.consumeTerms(parent, new Term(field, term));

View File

@ -23,6 +23,7 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Supplier;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import org.apache.lucene.document.Document;
@ -44,6 +45,7 @@ import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageTester;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
public class TermInSetQueryTest extends LuceneTestCase {
@ -297,4 +299,44 @@ public class TermInSetQueryTest extends LuceneTestCase {
// cached after two uses
assertTrue(policy.shouldCache(query));
}
public void testVisitor() {
// singleton reports back to consumeTerms()
TermInSetQuery singleton = new TermInSetQuery("field", new BytesRef("term1"));
singleton.visit(new QueryVisitor() {
@Override
public void consumeTerms(Query query, Term... terms) {
assertEquals(1, terms.length);
assertEquals(new Term("field", new BytesRef("term1")), terms[0]);
}
@Override
public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
fail("Singleton TermInSetQuery should not try to build ByteRunAutomaton");
}
});
// multiple values built into automaton
List<BytesRef> terms = new ArrayList<>();
for (int i = 0; i < 100; i++) {
terms.add(new BytesRef("term" + i));
}
TermInSetQuery t = new TermInSetQuery("field", terms);
t.visit(new QueryVisitor() {
@Override
public void consumeTerms(Query query, Term... terms) {
fail("TermInSetQuery with multiple terms should build automaton");
}
@Override
public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
ByteRunAutomaton a = automaton.get();
BytesRef test = new BytesRef("nonmatching");
assertFalse(a.run(test.bytes, test.offset, test.length));
for (BytesRef term : terms) {
assertTrue(a.run(term.bytes, term.offset, term.length));
}
}
});
}
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.search.uhighlight;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import java.util.function.Supplier;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause;
@ -80,8 +81,8 @@ final class MultiTermHighlighting {
}
@Override
public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton));
public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton.get()));
}
}