mirror of https://github.com/apache/lucene.git
LUCENE-9349: TermInSetQuery should use consumeMatchingTerms in visit() (#1465)
TermInSetQuery currently iterates through all its prefix-encoded terms in order to build an array to pass back to its visitor when visit() is called. This seems like a waste, particularly when the visitor is not actually consuming the terms (for example, when doing a clause-count check before executing a search). This commit changes TermInSetQuery to use consumeTermsMatching(), and also changes the signature of this method so that it takes a BytesRunAutomaton supplier to allow for lazy instantiation. In addition, IndexSearcher's clause count check wasn't counting leaves that called consumeTermsMatching().
This commit is contained in:
parent
59a8e83520
commit
267d70b66b
|
@ -142,6 +142,11 @@ API Changes
|
|||
interface that offers the basic methods to acquire pending merges, run the merge and do accounting
|
||||
around it. (Simon Willnauer)
|
||||
|
||||
* LUCENE-9349: QueryVisitor.consumeTermsMatching() now takes a
|
||||
Supplier<ByteRunAutomaton> to enable queries that build large automata to
|
||||
provide them lazily. TermsInSetQuery switches to using this method
|
||||
to report matching terms. (Alan Woodward)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
(No changes)
|
||||
|
|
|
@ -31,6 +31,7 @@ import java.util.concurrent.Executor;
|
|||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.FutureTask;
|
||||
import java.util.concurrent.ThreadPoolExecutor;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
|
@ -47,6 +48,7 @@ import org.apache.lucene.search.similarities.Similarity;
|
|||
import org.apache.lucene.store.NIOFSDirectory;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.ThreadInterruptedException;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
|
||||
/** Implements search over a single IndexReader.
|
||||
*
|
||||
|
@ -790,6 +792,14 @@ public class IndexSearcher {
|
|||
}
|
||||
++numClauses;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
|
||||
if (numClauses > maxClauseCount) {
|
||||
throw new TooManyClauses();
|
||||
}
|
||||
++numClauses;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.search;
|
|||
|
||||
import java.util.Arrays;
|
||||
import java.util.Set;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
|
@ -43,11 +44,11 @@ public abstract class QueryVisitor {
|
|||
*
|
||||
* @param query the leaf query
|
||||
* @param field the field queried against
|
||||
* @param automaton an automaton defining which terms match
|
||||
* @param automaton a supplier for an automaton defining which terms match
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
|
||||
public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
|
||||
visitLeaf(query); // default impl for backward compatibility
|
||||
}
|
||||
|
||||
|
|
|
@ -43,6 +43,11 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.util.BytesRefBuilder;
|
||||
import org.apache.lucene.util.DocIdSetBuilder;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.automaton.Automata;
|
||||
import org.apache.lucene.util.automaton.Automaton;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.automaton.Operations;
|
||||
|
||||
/**
|
||||
* Specialization for a disjunction over many terms that behaves like a
|
||||
|
@ -126,13 +131,22 @@ public class TermInSetQuery extends Query implements Accountable {
|
|||
if (visitor.acceptField(field) == false) {
|
||||
return;
|
||||
}
|
||||
QueryVisitor v = visitor.getSubVisitor(Occur.SHOULD, this);
|
||||
List<Term> terms = new ArrayList<>();
|
||||
TermIterator iterator = termData.iterator();
|
||||
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
|
||||
terms.add(new Term(field, BytesRef.deepCopyOf(term)));
|
||||
if (termData.size() == 1) {
|
||||
visitor.consumeTerms(this, new Term(field, termData.iterator().next()));
|
||||
}
|
||||
v.consumeTerms(this, terms.toArray(new Term[0]));
|
||||
if (termData.size() > 1) {
|
||||
visitor.consumeTermsMatching(this, field, this::asByteRunAutomaton);
|
||||
}
|
||||
}
|
||||
|
||||
private ByteRunAutomaton asByteRunAutomaton() {
|
||||
TermIterator iterator = termData.iterator();
|
||||
List<Automaton> automata = new ArrayList<>();
|
||||
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
|
||||
automata.add(Automata.makeBinary(term));
|
||||
}
|
||||
return new CompiledAutomaton(Operations.union(automata)).runAutomaton;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -354,12 +354,12 @@ public class CompiledAutomaton implements Accountable {
|
|||
if (visitor.acceptField(field)) {
|
||||
switch (type) {
|
||||
case NORMAL:
|
||||
visitor.consumeTermsMatching(parent, field, runAutomaton);
|
||||
visitor.consumeTermsMatching(parent, field, () -> runAutomaton);
|
||||
break;
|
||||
case NONE:
|
||||
break;
|
||||
case ALL:
|
||||
visitor.consumeTermsMatching(parent, field, new ByteRunAutomaton(Automata.makeAnyString()));
|
||||
visitor.consumeTermsMatching(parent, field, () -> new ByteRunAutomaton(Automata.makeAnyString()));
|
||||
break;
|
||||
case SINGLE:
|
||||
visitor.consumeTerms(parent, new Term(field, term));
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.util.HashSet;
|
|||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@ -44,6 +45,7 @@ import org.apache.lucene.util.IOUtils;
|
|||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.RamUsageTester;
|
||||
import org.apache.lucene.util.TestUtil;
|
||||
import org.apache.lucene.util.automaton.ByteRunAutomaton;
|
||||
|
||||
public class TermInSetQueryTest extends LuceneTestCase {
|
||||
|
||||
|
@ -297,4 +299,44 @@ public class TermInSetQueryTest extends LuceneTestCase {
|
|||
// cached after two uses
|
||||
assertTrue(policy.shouldCache(query));
|
||||
}
|
||||
|
||||
public void testVisitor() {
|
||||
// singleton reports back to consumeTerms()
|
||||
TermInSetQuery singleton = new TermInSetQuery("field", new BytesRef("term1"));
|
||||
singleton.visit(new QueryVisitor() {
|
||||
@Override
|
||||
public void consumeTerms(Query query, Term... terms) {
|
||||
assertEquals(1, terms.length);
|
||||
assertEquals(new Term("field", new BytesRef("term1")), terms[0]);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
|
||||
fail("Singleton TermInSetQuery should not try to build ByteRunAutomaton");
|
||||
}
|
||||
});
|
||||
|
||||
// multiple values built into automaton
|
||||
List<BytesRef> terms = new ArrayList<>();
|
||||
for (int i = 0; i < 100; i++) {
|
||||
terms.add(new BytesRef("term" + i));
|
||||
}
|
||||
TermInSetQuery t = new TermInSetQuery("field", terms);
|
||||
t.visit(new QueryVisitor() {
|
||||
@Override
|
||||
public void consumeTerms(Query query, Term... terms) {
|
||||
fail("TermInSetQuery with multiple terms should build automaton");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
|
||||
ByteRunAutomaton a = automaton.get();
|
||||
BytesRef test = new BytesRef("nonmatching");
|
||||
assertFalse(a.run(test.bytes, test.offset, test.length));
|
||||
for (BytesRef term : terms) {
|
||||
assertTrue(a.run(term.bytes, term.offset, term.length));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.search.uhighlight;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import org.apache.lucene.search.AutomatonQuery;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
|
@ -80,8 +81,8 @@ final class MultiTermHighlighting {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void consumeTermsMatching(Query query, String field, ByteRunAutomaton automaton) {
|
||||
runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton));
|
||||
public void consumeTermsMatching(Query query, String field, Supplier<ByteRunAutomaton> automaton) {
|
||||
runAutomata.add(LabelledCharArrayMatcher.wrap(query.toString(), automaton.get()));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue