From d764bf345e2789589fbead7df5838dc20247c577 Mon Sep 17 00:00:00 2001 From: Bruno Roustant Date: Wed, 18 Sep 2019 17:43:53 +0200 Subject: [PATCH] LUCENE-8983: Add PhraseWildcardQuery to control multi-terms expansions in phrase. --- lucene/CHANGES.txt | 2 + .../apache/lucene/search/MultiTermQuery.java | 11 +- .../org/apache/lucene/search/PhraseQuery.java | 16 + .../lucene/search/PhraseWildcardQuery.java | 1053 +++++++++++++++++ .../search/TestPhraseWildcardQuery.java | 570 +++++++++ 5 files changed, 1648 insertions(+), 4 deletions(-) create mode 100644 lucene/sandbox/src/java/org/apache/lucene/search/PhraseWildcardQuery.java create mode 100644 lucene/sandbox/src/test/org/apache/lucene/search/TestPhraseWildcardQuery.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2599ce2f0c5..4873b2b0554 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -79,6 +79,8 @@ Other * LUCENE-9046: Fix wrong example in Javadoc of TermInSetQuery (Namgyu Kim) +* LUCENE-8983: Add sandbox PhraseWildcardQuery to control multi-terms expansions in a phrase. (Bruno Roustant) + Build * Upgrade forbiddenapis to version 2.7; upgrade Groovy to 2.4.17. (Uwe Schindler) diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java index 636a7d6757a..6764d717121 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiTermQuery.java @@ -292,11 +292,14 @@ public abstract class MultiTermQuery extends Query { */ protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException; - /** Convenience method, if no attributes are needed: - * This simply passes empty attributes and is equal to: - * getTermsEnum(terms, new AttributeSource()) + /** + * Constructs an enumeration that expands the pattern term. + * This method should only be called if the field exists (ie, + * implementations can assume the field does exist). + * This method never returns null. + * The returned TermsEnum is positioned to the first matching term. */ - protected final TermsEnum getTermsEnum(Terms terms) throws IOException { + public final TermsEnum getTermsEnum(Terms terms) throws IOException { return getTermsEnum(terms, new AttributeSource()); } diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index b4d275e9203..1246f0482e8 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -332,6 +332,22 @@ public class PhraseQuery extends Query { } } + public PostingsAndFreq(PostingsEnum postings, ImpactsEnum impacts, int position, List terms) { + this.postings = postings; + this.impacts = impacts; + this.position = position; + nTerms = terms == null ? 
0 : terms.size(); + if (nTerms > 0) { + Term[] terms2 = terms.toArray(new Term[0]); + if (nTerms > 1) { + Arrays.sort(terms2); + } + this.terms = terms2; + } else { + this.terms = null; + } + } + @Override public int compareTo(PostingsAndFreq other) { if (position != other.position) { diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/PhraseWildcardQuery.java b/lucene/sandbox/src/java/org/apache/lucene/search/PhraseWildcardQuery.java new file mode 100644 index 00000000000..25623696aab --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/search/PhraseWildcardQuery.java @@ -0,0 +1,1053 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SlowImpactsEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.TermStates; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.similarities.Similarity; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.mutable.MutableValueBool; + +/** + * A generalized version of {@link PhraseQuery}, built with one or more {@link MultiTermQuery} + * that provides term expansions for multi-terms (one of the expanded terms must match). + *
+ * <p>
+ * Its main advantage is to control the total number of expansions across all {@link MultiTermQuery}
+ * and across all segments.
+ * <p>
+ * Use the {@link Builder} to build a {@link PhraseWildcardQuery}.
+ * <p>
+ * This query is similar to {@link MultiPhraseQuery}, but it handles, controls and optimizes the
+ * multi-term expansions.
+ * <p>
+ * This query is equivalent to building an ordered {@link org.apache.lucene.search.spans.SpanNearQuery}
+ * with a list of {@link org.apache.lucene.search.spans.SpanTermQuery} and
+ * {@link org.apache.lucene.search.spans.SpanMultiTermQueryWrapper}.
+ * But it optimizes the multi-term expansions and the segment accesses.
+ * It first resolves the single terms, and stops early if one of them does not match. Then
+ * it expands each multi-term sequentially, stopping immediately if one does not
+ * match. It detects the segments that do not match and skips them for the next
+ * expansions. This often avoids expanding the other multi-terms on some, or
+ * even all, segments. Finally, it controls the total number of expansions.
+ * <p>
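+ * For example, a phrase such as {@code "red appl*"} could otherwise be expressed as the
+ * ordered span query below (a sketch; the field name is illustrative):
+ * <pre class="prettyprint">
+ * SpanQuery[] clauses = new SpanQuery[] {
+ *     new SpanTermQuery(new Term("body", "red")),
+ *     new SpanMultiTermQueryWrapper&lt;&gt;(new WildcardQuery(new Term("body", "appl*")))
+ * };
+ * Query equivalent = new SpanNearQuery(clauses, 0, true);
+ * </pre>
+ * <p>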
+ * Immutable. + * @lucene.experimental + */ +public class PhraseWildcardQuery extends Query { + + protected static final Query NO_MATCH_QUERY = new MatchNoDocsQuery("Empty " + PhraseWildcardQuery.class.getSimpleName()); + + protected final String field; + protected final List phraseTerms; + protected final int slop; + protected final int maxMultiTermExpansions; + protected final boolean segmentOptimizationEnabled; + + protected PhraseWildcardQuery( + String field, + List phraseTerms, + int slop, + int maxMultiTermExpansions, + boolean segmentOptimizationEnabled) { + this.field = field; + this.phraseTerms = phraseTerms; + this.slop = slop; + this.maxMultiTermExpansions = maxMultiTermExpansions; + this.segmentOptimizationEnabled = segmentOptimizationEnabled; + } + + public String getField() { + return field; + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + if (phraseTerms.isEmpty()) { + return NO_MATCH_QUERY; + } + if (phraseTerms.size() == 1) { + return phraseTerms.get(0).getQuery(); + } + return super.rewrite(reader); + } + + @Override + public void visit(QueryVisitor visitor) { + if (!visitor.acceptField(field)) { + return; + } + QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.MUST, this); + for (PhraseTerm phraseTerm : phraseTerms) { + phraseTerm.getQuery().visit(v); + } + } + + @Override + public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { + IndexReader reader = searcher.getIndexReader(); + + // Build a list of segments ordered by terms size (number of terms). + // The first segments to be searched are the smaller ones, which are by + // design containing the most recent documents. Any segment in this list + // may also be removed in the PhraseTerm.collectTermData() calls below + // if one of the phrase term does not match in the segment. This allows + // to early stop expanding multi-terms on removed segments. + // Additionally there is a global multi-term expansion limit across all multi-terms + // and all segments. So this is important to first start with the smallest + // segments to give back non-used expansion credits to the next multi-terms, + // as this is more probable with the small segments. + List sizeSortedSegments = + new SegmentTermsSizeComparator().createTermsSizeSortedCopyOf(reader.leaves()); + + // TermsData will contain the collected TermState and TermStatistics for all the terms + // of the phrase. It is filled during PhraseTerm.collectTermData() calls below. + TermsData termsData = createTermsData(sizeSortedSegments.size()); + + // Iterate the phrase terms, and collect the TermState for single-terms. + // - Early stop if a single term does not match. + int numMultiTerms = 0; + for (PhraseTerm phraseTerm : phraseTerms) { + if (phraseTerm.hasExpansions()) { + numMultiTerms++; + } else { + assert TestCounters.get().incSingleTermAnalysisCount(); + int numMatches = phraseTerm.collectTermData(this, searcher, sizeSortedSegments, termsData); + if (numMatches == 0) { + // Early stop here because the single term does not match in any segment. + // So the whole phrase query cannot match. + return earlyStopWeight(); + } + } + } + + // Iterate the phrase terms and collect the TermState for multi-terms. + // - Early stop if a multi-term does not match. + // - Expand the multi-terms only when required. 
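+      // For example (illustrative numbers): with maxMultiTermExpansions = 90 and
+      // 3 multi-terms to expand, the first multi-term gets a limit of 90 / 3 = 30.
+      // If it only uses 10 expansions, the second gets (90 - 10) / 2 = 40, and so on.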
+ int remainingExpansions = maxMultiTermExpansions; + int remainingMultiTerms = numMultiTerms; + for (PhraseTerm phraseTerm : phraseTerms) { + if (phraseTerm.hasExpansions()) { + assert TestCounters.get().incMultiTermAnalysisCount(); + assert remainingExpansions >= 0 && remainingExpansions <= maxMultiTermExpansions; + assert remainingMultiTerms > 0; + // Consider the remaining expansions allowed for all remaining multi-terms. + // Divide it evenly to get the expansion limit for the current multi-term. + int maxExpansionsForTerm = remainingExpansions / remainingMultiTerms; + int numExpansions = phraseTerm.collectTermData(this, searcher, sizeSortedSegments, remainingMultiTerms, maxExpansionsForTerm, termsData); + assert numExpansions >= 0 && numExpansions <= maxExpansionsForTerm; + if (numExpansions == 0) { + // Early stop here because the multi-term does not match in any segment. + // So the whole phrase query cannot match. + return earlyStopWeight(); + } + // Deduct the effectively used expansions. This may give more expansion + // credits to the next multi-terms. + remainingExpansions -= numExpansions; + remainingMultiTerms--; + } + } + assert remainingMultiTerms == 0; + assert remainingExpansions >= 0; + +// TestCounters.get().printTestCounters(termsData); + + return termsData.areAllTermsMatching() ? + createPhraseWeight(searcher, scoreMode, boost, termsData) + : noMatchWeight(); + } + + /** + * Creates new {@link TermsData}. + */ + protected TermsData createTermsData(int numSegments) { + return new TermsData(phraseTerms.size(), numSegments); + } + + protected Weight earlyStopWeight() { + assert TestCounters.get().incQueryEarlyStopCount(); + return noMatchWeight(); + } + + protected Weight noMatchWeight() { + return new ConstantScoreWeight(this, 0) { + @Override + public Scorer scorer(LeafReaderContext leafReaderContext) { + return null; + } + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return true; + } + }; + } + + PhraseWeight createPhraseWeight(IndexSearcher searcher, ScoreMode scoreMode, + float boost, TermsData termsData) throws IOException { + return new PhraseWeight(this, field, searcher, scoreMode) { + + @Override + protected Similarity.SimScorer getStats(IndexSearcher searcher) throws IOException { + if (termsData.termStatsList.isEmpty()) { + return null; + } + return searcher.getSimilarity().scorer( + boost, + searcher.collectionStatistics(field), + termsData.termStatsList.toArray(new TermStatistics[0])); + } + + @Override + protected PhraseMatcher getPhraseMatcher(LeafReaderContext leafReaderContext, Similarity.SimScorer scorer, boolean exposeOffsets) throws IOException { + Terms fieldTerms = leafReaderContext.reader().terms(field); + if (fieldTerms == null) { + return null; + } + TermsEnum termsEnum = fieldTerms.iterator(); + float totalMatchCost = 0; + + PhraseQuery.PostingsAndFreq[] postingsFreqs = new PhraseQuery.PostingsAndFreq[phraseTerms.size()]; + for (int termPosition = 0; termPosition < postingsFreqs.length; termPosition++) { + TermData termData = termsData.getTermData(termPosition); + assert termData != null; + List termStates = termData.getTermStatesForSegment(leafReaderContext); + if (termStates == null) { + // If the current phrase term does not match in the segment, then the phrase cannot match on the segment. + // So early stop by returning a null scorer. 
+ return null; + } + assert !termStates.isEmpty(); + + List postingsEnums = new ArrayList<>(termStates.size()); + for (TermBytesTermState termBytesTermState : termStates) { + termsEnum.seekExact(termBytesTermState.termBytes, termBytesTermState.termState); + postingsEnums.add(termsEnum.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS)); + totalMatchCost += PhraseQuery.termPositionsCost(termsEnum); + } + PostingsEnum unionPostingsEnum; + if (postingsEnums.size() == 1) { + unionPostingsEnum = postingsEnums.get(0); + } else { + unionPostingsEnum = exposeOffsets ? new MultiPhraseQuery.UnionFullPostingsEnum(postingsEnums) : new MultiPhraseQuery.UnionPostingsEnum(postingsEnums); + } + postingsFreqs[termPosition] = new PhraseQuery.PostingsAndFreq(unionPostingsEnum, new SlowImpactsEnum(unionPostingsEnum), termPosition, termData.terms); + } + + if (slop == 0) { + // Sort by increasing docFreq order. + ArrayUtil.timSort(postingsFreqs); + return new ExactPhraseMatcher(postingsFreqs, scoreMode, scorer, totalMatchCost); + } else { + return new SloppyPhraseMatcher(postingsFreqs, slop, scoreMode, scorer, totalMatchCost, exposeOffsets); + } + } + + @Override + public void extractTerms(Set terms) { + for (int i = 0, size = phraseTerms.size(); i < size; i++) { + terms.addAll(termsData.getTermData(i).terms); + } + } + }; + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof PhraseWildcardQuery)) { + return false; + } + PhraseWildcardQuery pwq = (PhraseWildcardQuery) o; + return slop == pwq.slop && phraseTerms.equals(pwq.phraseTerms); + } + + @Override + public int hashCode() { + return classHash() ^ slop ^ phraseTerms.hashCode(); + } + + @Override + public final String toString(String omittedField) { + StringBuilder builder = new StringBuilder(); + builder.append("phraseWildcard("); + + if (field == null || !field.equals(omittedField)) { + builder.append(field).append(':'); + } + + builder.append('\"'); + for (int i = 0; i < phraseTerms.size(); i++) { + if (i != 0) { + builder.append(' '); + } + phraseTerms.get(i).toString(builder); + } + builder.append('\"'); + + if (slop != 0) { + builder.append('~'); + builder.append(slop); + } + + builder.append(")"); + return builder.toString(); + } + + /** + * Collects the {@link TermState} and {@link TermStatistics} for a single-term + * without expansion. + * + * @param termsData receives the collected data. + */ + protected int collectSingleTermData( + SingleTerm singleTerm, + IndexSearcher searcher, + List segments, + TermsData termsData) throws IOException { + TermData termData = termsData.getOrCreateTermData(singleTerm.termPosition); + Term term = singleTerm.term; + termData.terms.add(term); + TermStates termStates = TermStates.build(searcher.getIndexReader().getContext(), term, true); + + // Collect TermState per segment. 
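+    // Note: TermStates.build() above has already resolved the TermState of the term
+    // for every segment. The loop below only reads it per segment, and may remove
+    // from the list the segments where the term is absent (segment optimization).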
+ int numMatches = 0; + Iterator segmentIterator = segments.iterator(); + while (segmentIterator.hasNext()) { + LeafReaderContext leafReaderContext = segmentIterator.next(); + assert TestCounters.get().incSegmentUseCount(); + boolean termMatchesInSegment = false; + Terms terms = leafReaderContext.reader().terms(term.field()); + if (terms != null) { + checkTermsHavePositions(terms); + TermState termState = termStates.get(leafReaderContext); + if (termState != null) { + termMatchesInSegment = true; + numMatches++; + termData.setTermStatesForSegment(leafReaderContext, Collections.singletonList(new TermBytesTermState(term.bytes(), termState))); + } + } + if (!termMatchesInSegment && shouldOptimizeSegments()) { + // Remove this segment from the list because the phrase cannot match on it. + segmentIterator.remove(); + assert TestCounters.get().incSegmentSkipCount(); + } + } + // Collect the term stats across all segments. + if (termStates.docFreq() > 0) { + termsData.termStatsList.add(searcher.termStatistics(term, termStates.docFreq(), termStates.totalTermFreq())); + } + return numMatches; + } + + /** + * Collects the {@link TermState} and {@link TermStatistics} for a multi-term + * with expansion. + * + * @param remainingMultiTerms the number of remaining multi-terms to process, + * including the current one, excluding the multi-terms already processed. + * @param termsData receives the collected data. + */ + protected int collectMultiTermData( + MultiTerm multiTerm, + IndexSearcher searcher, + List segments, + int remainingMultiTerms, // Unused here but leveraged by extending classes. + int maxExpansionsForTerm, + TermsData termsData) throws IOException { + TermData termData = termsData.getOrCreateTermData(multiTerm.termPosition); + Map termStatsMap = createTermStatsMap(multiTerm); + int numExpansions = 0; + Iterator segmentIterator = segments.iterator(); + MutableValueBool shouldStopSegmentIteration = new MutableValueBool(); + + while (segmentIterator.hasNext() && !shouldStopSegmentIteration.value) { + LeafReaderContext leafReaderContext = segmentIterator.next(); + int remainingExpansions = maxExpansionsForTerm - numExpansions; + assert remainingExpansions >= 0; + List termStates = collectMultiTermDataForSegment( + multiTerm, leafReaderContext, remainingExpansions, shouldStopSegmentIteration, termStatsMap); + + if (!termStates.isEmpty()) { + assert termStates.size() <= remainingExpansions; + numExpansions += termStates.size(); + assert numExpansions <= maxExpansionsForTerm; + termData.setTermStatesForSegment(leafReaderContext, termStates); + + } else if (shouldOptimizeSegments()) { + // Remove this segment from the list because the phrase cannot match on it. + segmentIterator.remove(); + assert TestCounters.get().incSegmentSkipCount(); + } + } + + // Collect the term stats across all segments. + collectMultiTermStats(searcher, termStatsMap, termsData, termData); + return numExpansions; + } + + protected boolean shouldOptimizeSegments() { + return segmentOptimizationEnabled; + } + + /** + * Creates a {@link TermStats} map for a {@link MultiTerm}. + */ + protected Map createTermStatsMap(MultiTerm multiTerm) { // multiTerm param can be used by sub-classes. + return new HashMap<>(); + } + + /** + * Collects the {@link TermState} list and {@link TermStatistics} for a multi-term + * on a specific index segment. + * + * @param remainingExpansions the number of remaining expansions allowed + * for the segment. 
+ * @param shouldStopSegmentIteration to be set to true to stop the segment + * iteration calling this method repeatedly. + * @param termStatsMap receives the collected {@link TermStats} across all segments. + */ + protected List collectMultiTermDataForSegment( + MultiTerm multiTerm, + LeafReaderContext leafReaderContext, + int remainingExpansions, + MutableValueBool shouldStopSegmentIteration, + Map termStatsMap) throws IOException { + TermsEnum termsEnum = createTermsEnum(multiTerm, leafReaderContext); + if (termsEnum == null) { + return Collections.emptyList(); + } + assert TestCounters.get().incSegmentUseCount(); + List termStates = new ArrayList<>(); + while (termsEnum.next() != null && remainingExpansions > 0) { + // Collect term stats for the segment. + TermStats termStats = termStatsMap.get(termsEnum.term()); + if (termStats == null) { + BytesRef termBytes = BytesRef.deepCopyOf(termsEnum.term()); + termStats = new TermStats(termBytes); + termStatsMap.put(termBytes, termStats); + } + // Accumulate stats the same way TermStates.accumulateStatistics() does. + // Sum the stats per term for all segments the same way TermStates.build() does. + termStats.addStats(termsEnum.docFreq(), termsEnum.totalTermFreq()); + + // Collect TermState per segment. + termStates.add(new TermBytesTermState(termStats.termBytes, termsEnum.termState())); + remainingExpansions--; + assert TestCounters.get().incExpansionCount(); + } + assert remainingExpansions >= 0; + shouldStopSegmentIteration.value = remainingExpansions == 0; + return termStates; + } + + /** + * Creates the {@link TermsEnum} for the given {@link MultiTerm} and segment. + * + * @return null if there is no term for this query field in the segment. + */ + protected TermsEnum createTermsEnum(MultiTerm multiTerm, LeafReaderContext leafReaderContext) throws IOException { + Terms terms = leafReaderContext.reader().terms(field); + if (terms == null) { + return null; + } + checkTermsHavePositions(terms); + TermsEnum termsEnum = multiTerm.query.getTermsEnum(terms); + assert termsEnum != null; + return termsEnum; + } + + /** + * Collect the term stats across all segments. + * + * @param termStatsMap input map of already collected {@link TermStats}. + * @param termsData receives the {@link TermStatistics} computed for all {@link TermStats}. + * @param termData receives all the collected {@link Term}. + */ + protected void collectMultiTermStats( + IndexSearcher searcher, + Map termStatsMap, + TermsData termsData, + TermData termData) throws IOException { + // Collect term stats across all segments. + // Collect stats the same way MultiPhraseQuery.MultiPhraseWeight constructor does, for all terms and all segments. + for (Map.Entry termStatsEntry : termStatsMap.entrySet()) { + Term term = new Term(field, termStatsEntry.getKey()); + termData.terms.add(term); + TermStats termStats = termStatsEntry.getValue(); + if (termStats.docFreq > 0) { + termsData.termStatsList.add(searcher.termStatistics(term, termStats.docFreq, termStats.totalTermFreq)); + } + } + } + + protected void checkTermsHavePositions(Terms terms) { + if (!terms.hasPositions()) { + throw new IllegalStateException("field \"" + field + "\" was indexed without position data;" + + " cannot run " + PhraseWildcardQuery.class.getSimpleName()); + } + } + + /** + * Builds a {@link PhraseWildcardQuery}. 
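+ * <p>
+ * A minimal usage sketch (the field name, terms and expansion limit are illustrative):
+ * <pre class="prettyprint">
+ * PhraseWildcardQuery query = new PhraseWildcardQuery.Builder("body", 50)
+ *     .addTerm(new BytesRef("sweet"))
+ *     .addMultiTerm(new WildcardQuery(new Term("body", "appl*")))
+ *     .setSlop(1)
+ *     .build();
+ * </pre>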
+ */ + public static class Builder { + + protected final String field; + protected final List phraseTerms; + protected int slop; + protected final int maxMultiTermExpansions; + protected final boolean segmentOptimizationEnabled; + + /** + * @param field The query field. + * @param maxMultiTermExpansions The maximum number of expansions across all multi-terms and across all segments. + * It counts expansions for each segments individually, that allows optimizations per + * segment and unused expansions are credited to next segments. This is different from + * {@link MultiPhraseQuery} and {@link org.apache.lucene.search.spans.SpanMultiTermQueryWrapper} + * which have an expansion limit per multi-term. + */ + public Builder(String field, int maxMultiTermExpansions) { + this(field, maxMultiTermExpansions, true); + } + + /** + * @param field The query field. + * @param maxMultiTermExpansions The maximum number of expansions across all multi-terms and across all segments. + * It counts expansions for each segments individually, that allows optimizations per + * segment and unused expansions are credited to next segments. This is different from + * {@link MultiPhraseQuery} and {@link org.apache.lucene.search.spans.SpanMultiTermQueryWrapper} + * which have an expansion limit per multi-term. + * @param segmentOptimizationEnabled Whether to enable the segment optimization which consists in ignoring a segment + * for further analysis as soon as a term is not present inside it. This optimizes + * the query execution performance but changes the scoring. The result ranking is + * preserved. + */ + public Builder(String field, int maxMultiTermExpansions, boolean segmentOptimizationEnabled) { + this.field = field; + this.maxMultiTermExpansions = maxMultiTermExpansions; + this.segmentOptimizationEnabled = segmentOptimizationEnabled; + phraseTerms = new ArrayList<>(); + } + + /** + * Adds a single term at the next position in the phrase. + */ + public Builder addTerm(BytesRef termBytes) { + return addTerm(new Term(field, termBytes)); + } + + /** + * Adds a single term at the next position in the phrase. + */ + public Builder addTerm(Term term) { + if (!term.field().equals(field)) { + throw new IllegalArgumentException(term.getClass().getSimpleName() + + " field \"" + term.field() + "\" cannot be different from the " + + PhraseWildcardQuery.class.getSimpleName() + " field \"" + field + "\""); + } + phraseTerms.add(new SingleTerm(term, phraseTerms.size())); + return this; + } + + /** + * Adds a multi-term at the next position in the phrase. + * Any of the terms returned by the provided {@link MultiTermQuery} enumeration + * may match (expansion as a disjunction). + */ + public Builder addMultiTerm(MultiTermQuery multiTermQuery) { + if (!multiTermQuery.getField().equals(field)) { + throw new IllegalArgumentException(multiTermQuery.getClass().getSimpleName() + + " field \"" + multiTermQuery.getField() + "\" cannot be different from the " + + PhraseWildcardQuery.class.getSimpleName() + " field \"" + field + "\""); + } + phraseTerms.add(new MultiTerm(multiTermQuery, phraseTerms.size())); + return this; + } + + /** + * Sets the phrase slop. + */ + public Builder setSlop(int slop) { + if (slop < 0) { + throw new IllegalArgumentException("slop value cannot be negative"); + } + this.slop = slop; + return this; + } + + /** + * Builds a {@link PhraseWildcardQuery}. 
+ */ + public PhraseWildcardQuery build() { + return new PhraseWildcardQuery(field, phraseTerms, slop, maxMultiTermExpansions, segmentOptimizationEnabled); + } + } + + /** + * All {@link PhraseTerm} are light and immutable. They do not hold query + * processing data such as {@link TermsData}. That way, the {@link PhraseWildcardQuery} + * is immutable and light itself and can be used safely as a key of the query cache. + */ + protected abstract static class PhraseTerm { + + protected final int termPosition; + + protected PhraseTerm(int termPosition) { + this.termPosition = termPosition; + } + + protected abstract boolean hasExpansions(); + + protected abstract Query getQuery(); + + /** + * Collects {@link TermState} and {@link TermStatistics} for the term without expansion. + * It must be called only if {@link #hasExpansions()} returns false. + * Simplified version of {@code #collectTermData(PhraseWildcardQuery, IndexSearcher, List, int, int, TermsData)} + * with less arguments. This method throws {@link UnsupportedOperationException} if not overridden. + */ + protected int collectTermData( + PhraseWildcardQuery query, + IndexSearcher searcher, + List segments, + TermsData termsData) throws IOException { + throw new UnsupportedOperationException(); + } + + /** + * Collects {@link TermState} and {@link TermStatistics} for the term (potentially expanded). + * + * @param termsData {@link TermsData} to update with the collected terms and stats. + * @return The number of expansions or matches in all segments; or 0 if this term + * does not match in any segment, in this case the phrase query can immediately stop. + */ + protected abstract int collectTermData( + PhraseWildcardQuery query, + IndexSearcher searcher, + List segments, + int remainingMultiTerms, + int maxExpansionsForTerm, + TermsData termsData) throws IOException; + + protected abstract void toString(StringBuilder builder); + + @Override + public abstract boolean equals(Object o); + + @Override + public abstract int hashCode(); + } + + /** + * Phrase term with no expansion. + */ + protected static class SingleTerm extends PhraseTerm { + + protected final Term term; + + protected SingleTerm(Term term, int termPosition) { + super(termPosition); + this.term = term; + } + + @Override + protected boolean hasExpansions() { + return false; + } + + @Override + protected Query getQuery() { + return new TermQuery(term); + } + + @Override + protected int collectTermData( + PhraseWildcardQuery query, + IndexSearcher searcher, + List segments, + TermsData termsData) throws IOException { + return collectTermData(query, searcher, segments, 0, 0, termsData); + } + + @Override + protected int collectTermData( + PhraseWildcardQuery query, + IndexSearcher searcher, + List segments, + int remainingMultiTerms, + int maxExpansionsForTerm, + TermsData termsData) throws IOException { + return query.collectSingleTermData(this, searcher, segments, termsData); + } + + @Override + protected void toString(StringBuilder builder) { + builder.append(term.text()); + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof SingleTerm)) { + return false; + } + SingleTerm singleTerm = (SingleTerm) o; + return term.equals(singleTerm.term); + } + + @Override + public int hashCode() { + return term.hashCode(); + } + } + + /** + * Phrase term with expansions. 
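+ * Any of the terms expanded from the provided {@link MultiTermQuery} may match
+ * (the expansion is a disjunction).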
+ */ + protected static class MultiTerm extends PhraseTerm { + + protected final MultiTermQuery query; + + protected MultiTerm(MultiTermQuery query, int termPosition) { + super(termPosition); + this.query = query; + } + + @Override + protected boolean hasExpansions() { + return true; + } + + @Override + protected Query getQuery() { + return query; + } + + @Override + protected int collectTermData( + PhraseWildcardQuery query, + IndexSearcher searcher, + List segments, + int remainingMultiTerms, + int maxExpansionsForTerm, + TermsData termsData) throws IOException { + return query.collectMultiTermData(this, searcher, segments, remainingMultiTerms, maxExpansionsForTerm, termsData); + } + + @Override + protected void toString(StringBuilder builder) { + builder.append(query.toString(query.field)); + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof MultiTerm)) { + return false; + } + MultiTerm multiTerm = (MultiTerm) o; + return query.equals(multiTerm.query); + } + + @Override + public int hashCode() { + return query.hashCode(); + } + } + + /** + * Holds the {@link TermState} and {@link TermStatistics} for all the matched + * and collected {@link Term}, for all phrase terms, for all segments. + */ + protected static class TermsData { + + protected final int numTerms; + protected final int numSegments; + protected final List termStatsList; + protected final TermData[] termDataPerPosition; + protected int numTermsMatching; + + protected TermsData(int numTerms, int numSegments) { + this.numTerms = numTerms; + this.numSegments = numSegments; + termStatsList = new ArrayList<>(); + termDataPerPosition = new TermData[numTerms]; + } + + protected TermData getOrCreateTermData(int termPosition) { + TermData termData = termDataPerPosition[termPosition]; + if (termData == null) { + termData = new TermData(numSegments, this); + termDataPerPosition[termPosition] = termData; + } + return termData; + } + + protected TermData getTermData(int termPosition) { + return termDataPerPosition[termPosition]; + } + + protected boolean areAllTermsMatching() { + assert numTermsMatching <= numTerms; + return numTermsMatching == numTerms; + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("TermsData("); + builder.append("numSegments=").append(numSegments); + builder.append(", termDataPerPosition=").append(Arrays.asList(termDataPerPosition)); + builder.append(", termsStatsList=["); + for (TermStatistics termStatistics : termStatsList) { + builder.append("{") + .append(termStatistics.term().utf8ToString()) + .append(", ").append(termStatistics.docFreq()) + .append(", ").append(termStatistics.totalTermFreq()) + .append("}"); + } + builder.append("]"); + builder.append(")"); + return builder.toString(); + } + } + + /** + * Holds the {@link TermState} for all the collected {@link Term}, + * for a specific phrase term, for all segments. + */ + protected static class TermData { + + protected final int numSegments; + protected final TermsData termsData; + protected List[] termStatesPerSegment; + protected final List terms; + + protected TermData(int numSegments, TermsData termsData) { + this.numSegments = numSegments; + this.termsData = termsData; + terms = new ArrayList<>(); + } + + /** + * Sets the collected list of {@link TermBytesTermState} for the given segment. 
+ */
+ */ + @SuppressWarnings("unchecked") + protected void setTermStatesForSegment(LeafReaderContext leafReaderContext, List termStates) { + if (termStatesPerSegment == null) { + termStatesPerSegment = (List[]) new List[numSegments]; + termsData.numTermsMatching++; + } + termStatesPerSegment[leafReaderContext.ord] = termStates; + } + + /** + * @return The collected list of {@link TermBytesTermState} for the given segment; + * or null if this phrase term does not match in the given segment. + */ + protected List getTermStatesForSegment(LeafReaderContext leafReaderContext) { + assert termStatesPerSegment != null : "No TermState for any segment; the query should have been stopped before"; + return termStatesPerSegment[leafReaderContext.ord]; + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder(); + builder.append("TermData("); + builder.append("termStates="); + if (termStatesPerSegment == null) { + builder.append("null"); + } else { + builder.append(Arrays.asList(termStatesPerSegment)); + } + builder.append(", terms=").append(terms); + builder.append(")"); + return builder.toString(); + } + } + + /** + * Holds a pair of term bytes - term state. + */ + public static class TermBytesTermState { + + protected final BytesRef termBytes; + protected final TermState termState; + + public TermBytesTermState(BytesRef termBytes, TermState termState) { + this.termBytes = termBytes; + this.termState = termState; + } + + @Override + public String toString() { + return "\"" + termBytes.utf8ToString() + "\"->" + termState; + } + } + + /** + * Accumulates the doc freq and total term freq. + */ + public static class TermStats { + + protected final BytesRef termBytes; + protected int docFreq; + protected long totalTermFreq; + + protected TermStats(BytesRef termBytes) { + this.termBytes = termBytes; + } + + public BytesRef getTermBytes() { + return termBytes; + } + + protected void addStats(int docFreq, long totalTermFreq) { + this.docFreq += docFreq; + if (this.totalTermFreq >= 0 && totalTermFreq >= 0) { + this.totalTermFreq += totalTermFreq; + } else { + this.totalTermFreq = -1; + } + } + } + + /** + * Compares segments based of the number of terms they contain. + *
+ * <p>
+ * This is used to sort segments incrementally by number of terms. This + * way the first segment to search is the smallest, so a term has the lowest + * probability to match in this segment. And if the term does not match, + * we credit unused expansions when searching the other next segments. + */ + protected class SegmentTermsSizeComparator implements Comparator { + + private static final String COMPARISON_ERROR_MESSAGE = "Segment comparison error"; + + @Override + public int compare(LeafReaderContext leafReaderContext1, LeafReaderContext leafReaderContext2) { + try { + return Long.compare(getTermsSize(leafReaderContext1), getTermsSize(leafReaderContext2)); + } catch (IOException e) { + throw new RuntimeException(COMPARISON_ERROR_MESSAGE, e); + } + } + + protected List createTermsSizeSortedCopyOf(List segments) throws IOException { + List copy = new ArrayList<>(segments); + try { + copy.sort(this); + } catch (RuntimeException e) { + if (COMPARISON_ERROR_MESSAGE.equals(e.getMessage())) { + throw (IOException) e.getCause(); + } + throw e; + } + return copy; + } + + private long getTermsSize(LeafReaderContext leafReaderContext) throws IOException { + Terms terms = leafReaderContext.reader().terms(field); + return terms == null ? 0 : terms.size(); + } + } + + /** + * Test counters incremented when assertions are enabled. Used only when testing. + */ + protected static class TestCounters { + + private static final TestCounters SINGLETON = new TestCounters(); + + protected long singleTermAnalysisCount; + protected long multiTermAnalysisCount; + protected long expansionCount; + protected long segmentUseCount; + protected long segmentSkipCount; + protected long queryEarlyStopCount; + + protected static TestCounters get() { + return SINGLETON; + } + + protected boolean incSingleTermAnalysisCount() { + singleTermAnalysisCount++; + return true; + } + + protected boolean incMultiTermAnalysisCount() { + multiTermAnalysisCount++; + return true; + } + + protected boolean incExpansionCount() { + expansionCount++; + return true; + } + + protected boolean incSegmentUseCount() { + segmentUseCount++; + return true; + } + + protected boolean incSegmentSkipCount() { + segmentSkipCount++; + return true; + } + + protected boolean incQueryEarlyStopCount() { + queryEarlyStopCount++; + return true; + } + + protected void clear() { + singleTermAnalysisCount = 0; + multiTermAnalysisCount = 0; + expansionCount = 0; + segmentUseCount = 0; + segmentSkipCount = 0; + queryEarlyStopCount = 0; + } + +// protected void printTestCounters(TermsData termsData) { +// System.out.println("singleTermAnalysisCount=" + singleTermAnalysisCount); +// System.out.println("multiTermAnalysisCount=" + multiTermAnalysisCount); +// System.out.println("expansionCount=" + expansionCount); +// System.out.println("segmentUseCount=" + segmentUseCount); +// System.out.println("segmentSkipCount=" + segmentSkipCount); +// System.out.println("queryEarlyStopCount=" + queryEarlyStopCount); +// System.out.println(termsData); +// } + } +} \ No newline at end of file diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestPhraseWildcardQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestPhraseWildcardQuery.java new file mode 100644 index 00000000000..6d641ac4f28 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestPhraseWildcardQuery.java @@ -0,0 +1,570 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanQuery; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; + +import static org.apache.lucene.search.PhraseWildcardQuery.TestCounters; + +/** + * Tests {@link PhraseWildcardQuery}. + *
+ * <p>
+ * The main goal of this class is to verify that {@link PhraseWildcardQuery}
+ * produces the same ranking and the same scores as both {@link MultiPhraseQuery}
+ * and {@link SpanNearQuery}.
+ * <p>
+ * Note that the ranking and scoring are equal if the segment optimization + * is disabled, otherwise it may change the score, but the ranking is most + * often the same. + */ +public class TestPhraseWildcardQuery extends LuceneTestCase { + + protected static final int MAX_DOCS = 1000; + protected static final String[] FIELDS = {"title", "author", "category", "other"}; + + protected Directory directory; + protected IndexReader reader; + protected IndexSearcher searcher; + protected boolean differentScoreExpectedForSpanNearQuery; + + @Override + public void setUp() throws Exception { + super.setUp(); + directory = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random(), directory); + iw.setDoRandomForceMerge(false); // Keep the segments separated. + addSegments(iw); + reader = iw.getReader(); + iw.close(); + searcher = newSearcher(reader); + } + + @Override + public void tearDown() throws Exception { + reader.close(); + directory.close(); + super.tearDown(); + } + + public void testOneMultiTerm() throws Exception { + searchAndCheckResults(field(1), 100, "eric", "br*"); + assertEquals(1, TestCounters.get().singleTermAnalysisCount); + assertEquals(1, TestCounters.get().multiTermAnalysisCount); + assertEquals(4, TestCounters.get().segmentUseCount); + assertEquals(0, TestCounters.get().segmentSkipCount); + } + + public void testTwoMultiTerms() throws Exception { + searchAndCheckResults(field(1), 100, "e*", "b*"); + assertEquals(0, TestCounters.get().singleTermAnalysisCount); + assertEquals(2, TestCounters.get().multiTermAnalysisCount); + assertEquals(4, TestCounters.get().segmentUseCount); + assertEquals(0, TestCounters.get().segmentSkipCount); + + expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> { + searchAndCheckResults(field(2), 100, "tim*", "t*"); + assertEquals(0, TestCounters.get().singleTermAnalysisCount); + assertEquals(2, TestCounters.get().multiTermAnalysisCount); + assertEquals(2, TestCounters.get().segmentUseCount); + assertEquals(1, TestCounters.get().segmentSkipCount); + }); + } + + public void testThreeMultiTerms() throws Exception { + searchAndCheckResults(field(0), 100, "t*", "ut?pi?", "e*"); + assertEquals(0, TestCounters.get().singleTermAnalysisCount); + assertEquals(3, TestCounters.get().multiTermAnalysisCount); + assertEquals(4, TestCounters.get().segmentUseCount); + assertEquals(1, TestCounters.get().segmentSkipCount); + + searchAndCheckResults(field(0), 100, "t?e", "u*", "e*"); + assertEquals(0, TestCounters.get().singleTermAnalysisCount); + assertEquals(3, TestCounters.get().multiTermAnalysisCount); + assertEquals(4, TestCounters.get().segmentUseCount); + assertEquals(1, TestCounters.get().segmentSkipCount); + + expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> { + searchAndCheckResults(field(0), 100, "t?e", "b*", "b*"); + assertEquals(0, TestCounters.get().singleTermAnalysisCount); + assertEquals(3, TestCounters.get().multiTermAnalysisCount); + assertEquals(4, TestCounters.get().segmentUseCount); + assertEquals(1, TestCounters.get().segmentSkipCount); + }); + } + + public void testOneSingleTermTwoMultiTerms() throws Exception { + searchAndCheckResults(field(0), 100, "t*", "utopia", "e*"); + assertEquals(1, TestCounters.get().singleTermAnalysisCount); + assertEquals(2, TestCounters.get().multiTermAnalysisCount); + assertEquals(4, TestCounters.get().segmentUseCount); + assertEquals(1, TestCounters.get().segmentSkipCount); + + searchAndCheckResults(field(0), 100, "t?e", "utopia", "e*"); + assertEquals(1, 
TestCounters.get().singleTermAnalysisCount);
+    assertEquals(2, TestCounters.get().multiTermAnalysisCount);
+    assertEquals(4, TestCounters.get().segmentUseCount);
+    assertEquals(1, TestCounters.get().segmentSkipCount);
+
+    searchAndCheckResults(field(0), 100, "t?a", "utopia", "e*");
+    assertEquals(1, TestCounters.get().singleTermAnalysisCount);
+    assertEquals(1, TestCounters.get().multiTermAnalysisCount);
+    assertEquals(3, TestCounters.get().segmentUseCount);
+    assertEquals(2, TestCounters.get().segmentSkipCount);
+  }
+
+  public void testTermDoesNotMatch() throws Exception {
+    searchAndCheckResults(field(0), 100, "nomatch", "e*");
+    // We expect that createPhraseWeight() is not called because the first term does
+    // not match, so the query is early stopped without multi-term expansion.
+    assertEquals(1, TestCounters.get().singleTermAnalysisCount);
+    assertEquals(0, TestCounters.get().multiTermAnalysisCount);
+    assertEquals(2, TestCounters.get().segmentUseCount);
+    assertEquals(2, TestCounters.get().segmentSkipCount);
+
+    searchAndCheckResults(field(0), 100, "t*", "nomatch", "e*");
+    assertEquals(1, TestCounters.get().singleTermAnalysisCount);
+    assertEquals(0, TestCounters.get().multiTermAnalysisCount);
+    assertEquals(2, TestCounters.get().segmentUseCount);
+    assertEquals(2, TestCounters.get().segmentSkipCount);
+  }
+
+  public void testNoMultiTerm() throws Exception {
+    searchAndCheckResults(field(0), 100, "the", "utopia");
+    searchAndCheckResults(field(0), 100, "utopia", "the");
+    searchAndCheckResults(field(0), 100, "the", "experiment");
+  }
+
+  public void testMaxExpansions() throws Exception {
+    // The limit on the number of expansions is different with PhraseWildcardQuery
+    // because it applies to each segment individually, and not globally, unlike
+    // MultiPhraseQuery and SpanMultiTermQueryWrapper.
+    // Here we verify the total number of expansions directly from test stats
+    // inside PhraseWildcardQuery.
+
+    clearTestCounters();
+    searcher.search(phraseWildcardQuery(field(1), 3, 0, true, "e*", "b*"), MAX_DOCS);
+    // We expect 3 expansions even if both multi-terms have potentially more expansions.
+    assertEquals(3, TestCounters.get().expansionCount);
+
+    clearTestCounters();
+    searcher.search(phraseWildcardQuery(field(0), 4, 0, true, "t?e", "utopia", "e*"), MAX_DOCS);
+    // We expect 2 expansions since the "utopia" term matches only in the
+    // first segment, so there is no expansion for the second segment.
+    assertEquals(2, TestCounters.get().expansionCount);
+  }
+
+  public void testSegmentOptimizationSingleField() throws Exception {
+    searchAndCheckResults(field(0), 100, 0, true, "b*", "e*");
+    // Both multi-terms are present in both segments.
+    // So expecting 4 segment accesses.
+    assertEquals(4, TestCounters.get().segmentUseCount);
+    assertEquals(0, TestCounters.get().segmentSkipCount);
+    assertEquals(0, TestCounters.get().queryEarlyStopCount);
+
+    searchAndCheckResults(field(0), 100, 0, true, "t?e", "b*", "e*");
+    // "t?e" matches only in the first segment. This term adds 2 segment accesses and 1 segment skip.
+    // The other multi-terms match in the first segment. Each one adds 1 segment access.
+    // So expecting 4 segment accesses and 1 segment skip.
+ assertEquals(4, TestCounters.get().segmentUseCount); + assertEquals(1, TestCounters.get().segmentSkipCount); + assertEquals(0, TestCounters.get().queryEarlyStopCount); + + searchAndCheckResults(field(0), 100, 0, true, "t?e", "blind", "e*"); + assertEquals(3, TestCounters.get().segmentUseCount); + assertEquals(2, TestCounters.get().segmentSkipCount); + assertEquals(1, TestCounters.get().queryEarlyStopCount); + + expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> { + searchAndCheckResults(field(2), 100, 0, true, "tim*", "t*"); + assertEquals(2, TestCounters.get().segmentUseCount); + assertEquals(1, TestCounters.get().segmentSkipCount); + assertEquals(0, TestCounters.get().queryEarlyStopCount); + }); + } + + public void testMultiplePhraseWildcards() throws Exception { + searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{ + new String[]{"e*", "b*"}, + new String[]{"t?e", "utopia"} + }); + searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{ + new String[]{"e*", "b*"}, + new String[]{"d*", "b*"} + }); + searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{ + new String[]{"e*", "b*"}, + new String[]{"t?e", "utopia"}, + new String[]{"d*", "b*"} + }); + expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> + searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{ + new String[]{"e*", "b*"}, + new String[]{"b*", "b*"} + })); + expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> + searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{ + new String[]{"e*", "b*"}, + new String[]{"b*", "b*"}, + new String[]{"t?e", "utopia"} + })); + searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{ + new String[]{"e*", "b*"}, + new String[]{"e*", "b*"} + }); + searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{ + new String[]{"e*", "b*"}, + new String[]{"t?e", "utopia"}, + new String[]{"e*", "b*"} + }); + } + + public void testToString() { + Query testQuery = phraseWildcardQuery(field(0), 100, 0, true, "t?e", "b*", "e*"); + assertEquals("phraseWildcard(title:\"t?e b* e*\")", testQuery.toString()); + + testQuery = phraseWildcardQuery(field(0), 100, 1, true, "t?e", "utopia", "e*"); + assertEquals("phraseWildcard(\"t?e utopia e*\"~1)", testQuery.toString(field(0))); + + testQuery = phraseWildcardQuery(field(0), 100, 1, true, "t?e", "b*", "b*"); + assertEquals("phraseWildcard(\"t?e b* b*\"~1)", testQuery.toString(field(0))); + } + + public void testExplain() throws IOException { + Query testQuery = phraseWildcardQuery(field(0), 100, 0, true, "t?e", "b*", "b*"); + + // Verify the standard way to get the query explanation. + for (ScoreDoc scoreDoc : searcher.search(testQuery, MAX_DOCS).scoreDocs) { + Explanation explanation = searcher.explain(testQuery, scoreDoc.doc); + assertTrue(explanation.getValue().doubleValue() > 0); + assertEquals("weight(phraseWildcard(title:\"t?e b* b*\") in 1) [AssertingSimilarity], result of:", explanation.getDescription()); + } + + // Verify that if we call PhraseWildcardQuery.PhraseWildcardWeight.scorer() twice, + // the scoring is correct (even if it is not the standard path expected by the scorer() method). 
+ int resultCount = 0; + Weight weight = testQuery.createWeight(searcher, ScoreMode.TOP_SCORES, 1); + for (LeafReaderContext leafReaderContext : searcher.getIndexReader().leaves()) { + Scorer scorer = weight.scorer(leafReaderContext); + if (scorer != null) { + DocIdSetIterator iterator = scorer.iterator(); + while (iterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + resultCount++; + } + } + } + assertEquals(1, resultCount); + + int explanationWithNonNullScoreCount = 0; + for (LeafReaderContext leafReaderContext : searcher.getIndexReader().leaves()) { + Explanation explanation = weight.explain(leafReaderContext, 1); + if (explanation.getValue().doubleValue() > 0) { + explanationWithNonNullScoreCount++; + } + } + assertEquals(1, explanationWithNonNullScoreCount); + } + + /** + * With two similar multi-terms which expansions are subsets (e.g. "tim*" and "t*"), + * we expect {@link PhraseWildcardQuery} and {@link MultiPhraseQuery} to + * have the same scores, but {@link SpanNearQuery} scores are different. + */ + protected void expectDifferentScoreForSpanNearQueryWithMultiTermSubset(RunnableWithIOException runnable) throws IOException { + try { + differentScoreExpectedForSpanNearQuery = true; + runnable.run(); + } finally { + differentScoreExpectedForSpanNearQuery = false; + } + } + + /** + * Compares {@link PhraseWildcardQuery} to both {@link MultiPhraseQuery} + * and {@link SpanNearQuery}. + */ + protected void searchAndCheckResults(String field, int maxExpansions, String... terms) throws IOException { + for (int slop = 0; slop <= 1; slop++) { + searchAndCheckResults(field, maxExpansions, slop, false, terms); + searchAndCheckResults(field, maxExpansions, slop, true, terms); + } + } + + protected void searchAndCheckResults(String field, int maxExpansions, int slop, + boolean segmentOptimizationEnabled, String... terms) throws IOException { + searchAndCheckSameResults( + phraseWildcardQuery(field, maxExpansions, slop, segmentOptimizationEnabled, terms), + multiPhraseQuery(field, maxExpansions, slop, terms), + spanNearQuery(field, slop, terms), + segmentOptimizationEnabled); + } + + protected void searchAndCheckResultsMultiplePhraseWildcards(String[] fields, int maxExpansions, + int slop, String[][] multiPhraseTerms) throws IOException { + searchAndCheckResultsMultiplePhraseWildcards(fields, maxExpansions, slop, false, multiPhraseTerms); + searchAndCheckResultsMultiplePhraseWildcards(fields, maxExpansions, slop, true, multiPhraseTerms); + } + + protected void searchAndCheckResultsMultiplePhraseWildcards(String[] fields, int maxExpansions, int slop, + boolean segmentOptimizationEnabled, String[][] multiPhraseTerms) throws IOException { + BooleanQuery.Builder phraseWildcardQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder multiPhraseQueryBuilder = new BooleanQuery.Builder(); + BooleanQuery.Builder spanNearQueryBuilder = new BooleanQuery.Builder(); + for (String[] terms : multiPhraseTerms) { + BooleanClause.Occur occur = random().nextBoolean() ? 
BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD; + phraseWildcardQueryBuilder.add(disMaxQuery(phraseWildcardQueries(fields, maxExpansions, slop, segmentOptimizationEnabled, terms)), occur); + multiPhraseQueryBuilder.add(disMaxQuery(multiPhraseQueries(fields, maxExpansions, slop, terms)), occur); + spanNearQueryBuilder.add(disMaxQuery(spanNearQueries(fields, slop, terms)), occur); + } + searchAndCheckSameResults( + phraseWildcardQueryBuilder.build(), + multiPhraseQueryBuilder.build(), + spanNearQueryBuilder.build(), + segmentOptimizationEnabled + ); + } + + protected Query disMaxQuery(Query... disjuncts) { + return new DisjunctionMaxQuery(Arrays.asList(disjuncts), 0.1f); + } + + protected Query[] phraseWildcardQueries(String[] fields, int maxExpansions, int slop, boolean segmentOptimizationEnabled, String... terms) { + Query[] queries = new Query[fields.length]; + for (int i = 0; i < fields.length; i++) { + queries[i] = phraseWildcardQuery(fields[i], maxExpansions, slop, segmentOptimizationEnabled, terms); + } + return queries; + } + + protected Query[] multiPhraseQueries(String[] fields, int maxExpansions, int slop, String... terms) throws IOException { + Query[] queries = new Query[fields.length]; + for (int i = 0; i < fields.length; i++) { + queries[i] = multiPhraseQuery(fields[i], maxExpansions, slop, terms); + } + return queries; + } + + protected Query[] spanNearQueries(String[] fields, int slop, String... terms) { + Query[] queries = new Query[fields.length]; + for (int i = 0; i < fields.length; i++) { + queries[i] = spanNearQuery(fields[i], slop, terms); + } + return queries; + } + + protected void searchAndCheckSameResults(Query testQuery, Query multiPhraseQuery, Query spanNearQuery, boolean segmentOptimizationEnabled) throws IOException { + // Search and compare results with MultiPhraseQuery. + // Do not compare the scores if the segment optimization is enabled because + // it changes the score (but not the result ranking). + boolean sameScoreExpected = !segmentOptimizationEnabled; + searchAndCheckSameResults(testQuery, multiPhraseQuery, sameScoreExpected); + + // Clear the test stats to verify them only with the last test query execution. + clearTestCounters(); + // Search and compare results with SpanNearQuery. 
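+    // SpanNearQuery may also score differently when the expansions of one multi-term
+    // are a subset of another's (see expectDifferentScoreForSpanNearQueryWithMultiTermSubset).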
+ sameScoreExpected = !segmentOptimizationEnabled && !differentScoreExpectedForSpanNearQuery; + searchAndCheckSameResults(testQuery, spanNearQuery, sameScoreExpected); + } + + protected void clearTestCounters() { + TestCounters.get().clear(); + } + + protected void searchAndCheckSameResults(Query testQuery, Query referenceQuery, + boolean compareScores) throws IOException { + ScoreDoc[] testResults = searcher.search(testQuery, MAX_DOCS).scoreDocs; + ScoreDoc[] referenceResults = searcher.search(referenceQuery, MAX_DOCS).scoreDocs; + assertEquals("Number of results differ when comparing to " + referenceQuery.getClass().getSimpleName(), + referenceResults.length, testResults.length); + if (compareScores) { + for (int i = 0; i < testResults.length; i++) { + ScoreDoc testResult = testResults[i]; + ScoreDoc referenceResult = referenceResults[i]; + assertTrue("Result " + i + " differ when comparing to " + referenceQuery.getClass().getSimpleName() + + "\ntestResults=" + Arrays.toString(testResults) + "\nreferenceResults=" + Arrays.toString(referenceResults), + equals(testResult, referenceResult)); + } + } else { + Set testResultDocIds = Arrays.stream(testResults).map(scoreDoc -> scoreDoc.doc).collect(Collectors.toSet()); + Set referenceResultDocIds = Arrays.stream(referenceResults).map(scoreDoc -> scoreDoc.doc).collect(Collectors.toSet()); + assertEquals("Results differ when comparing to " + referenceQuery.getClass().getSimpleName() + + " ignoring score\ntestResults=" + Arrays.toString(testResults) + "\nreferenceResults=" + Arrays.toString(referenceResults), + referenceResultDocIds, testResultDocIds); + } + } + + protected PhraseWildcardQuery phraseWildcardQuery(String field, int maxExpansions, + int slop, boolean segmentOptimizationEnabled, String... terms) { + PhraseWildcardQuery.Builder builder = createPhraseWildcardQueryBuilder(field, maxExpansions, segmentOptimizationEnabled) + .setSlop(slop); + for (String term : terms) { + if (term.contains("*") || term.contains("?")) { + builder.addMultiTerm(new WildcardQuery(new Term(field, term))); + } else { + builder.addTerm(new BytesRef(term)); + } + } + return builder.build(); + } + + protected PhraseWildcardQuery.Builder createPhraseWildcardQueryBuilder( + String field, int maxExpansions, boolean segmentOptimizationEnabled) { + return new PhraseWildcardQuery.Builder(field, maxExpansions, segmentOptimizationEnabled); + } + + protected SpanNearQuery spanNearQuery(String field, int slop, String... terms) { + SpanQuery[] spanQueries = new SpanQuery[terms.length]; + for (int i = 0; i < terms.length; i++) { + String term = terms[i]; + spanQueries[i] = term.contains("*") || term.contains("?") ? + new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term(field, term))) + : new SpanTermQuery(new Term(field, term)); + } + return new SpanNearQuery(spanQueries, slop, true); + } + + protected MultiPhraseQuery multiPhraseQuery(String field, int maxExpansions, int slop, String... 
terms) throws IOException { + MultiPhraseQuery.Builder builder = new MultiPhraseQuery.Builder() + .setSlop(slop); + for (String term : terms) { + if (term.contains("*") || term.contains("?")) { + Term[] expansions = expandMultiTerm(field, term, maxExpansions); + if (expansions.length > 0) { + builder.add(expansions); + } else { + builder.add(new Term(field, "non-matching-term")); + } + } else { + builder.add(new Term(field, term)); + } + } + return builder.build(); + } + + protected Term[] expandMultiTerm(String field, String term, int maxExpansions) throws IOException { + if (maxExpansions == 0) { + return new Term[0]; + } + Set expansions = new HashSet<>(); + WildcardQuery wq = new WildcardQuery(new Term(field, term)); + expansion: + for (final LeafReaderContext ctx : reader.leaves()) { + Terms terms = ctx.reader().terms(field); + if (terms != null) { + TermsEnum termsEnum = wq.getTermsEnum(terms); + while (termsEnum.next() != null) { + expansions.add(new Term(field, termsEnum.term())); + if (expansions.size() >= maxExpansions) { + break expansion; + } + } + } + } + return expansions.toArray(new Term[0]); + } + + protected static boolean equals(ScoreDoc result1, ScoreDoc result2) { + // Due to randomness, the value of the score comparison epsilon varies much. + // We take 1E-1 epsilon to ensure the test do not flap. + return result1.doc == result2.doc && (Math.abs(result1.score - result2.score) < 1E-1); + } + + protected void addSegments(RandomIndexWriter iw) throws IOException { + // First segment. + addDocs(iw, + doc( + field(field(0), "time conversion"), + field(field(1), "eric hawk"), + field(field(2), "time travel") + ), + doc( + field(field(0), "the blinking books"), + field(field(1), "donald ever"), + field(field(2), "time travel") + ), + doc( + field(field(0), "the utopia experiment"), + field(field(1), "dylan brief"), + field(field(2), "utopia"), + field(field(3), "travelling to utopiapolis") + ) + ); + iw.commit(); + + // Second segment. + // No field(2). + addDocs(iw, + doc( + field(field(0), "serene evasion"), + field(field(1), "eric brown") + ), + doc( + field(field(0), "my blind experiment"), + field(field(1), "eric bright") + ), + doc( + field(field(3), "two times travel") + ) + ); + iw.commit(); + } + + protected String field(int index) { + return FIELDS[index]; + } + + protected static void addDocs(RandomIndexWriter iw, Document... docs) throws IOException { + iw.addDocuments(Arrays.asList(docs)); + } + + protected static Document doc(Field... fields) { + Document doc = new Document(); + for (Field field : fields) { + doc.add(field); + } + return doc; + } + + protected static Field field(String field, String fieldValue) { + return newTextField(field, fieldValue, Field.Store.NO); + } + + private interface RunnableWithIOException { + + void run() throws IOException; + } +}