mirror of https://github.com/apache/lucene.git
LUCENE-8983: Add PhraseWildcardQuery to control multi-terms expansions in phrase.
This commit is contained in:
parent
f70e21c91c
commit
8485b5a939
|
@ -143,6 +143,8 @@ Other
|
||||||
|
|
||||||
* LUCENE-9046: Fix wrong example in Javadoc of TermInSetQuery (Namgyu Kim)
|
* LUCENE-9046: Fix wrong example in Javadoc of TermInSetQuery (Namgyu Kim)
|
||||||
|
|
||||||
|
* LUCENE-8983: Add sandbox PhraseWildcardQuery to control multi-terms expansions in a phrase. (Bruno Roustant)
|
||||||
|
|
||||||
Build
|
Build
|
||||||
|
|
||||||
* Upgrade forbiddenapis to version 2.7; upgrade Groovy to 2.4.17. (Uwe Schindler)
|
* Upgrade forbiddenapis to version 2.7; upgrade Groovy to 2.4.17. (Uwe Schindler)
|
||||||
|
|
|
@ -292,11 +292,14 @@ public abstract class MultiTermQuery extends Query {
|
||||||
*/
|
*/
|
||||||
protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException;
|
protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException;
|
||||||
|
|
||||||
/** Convenience method, if no attributes are needed:
|
/**
|
||||||
* This simply passes empty attributes and is equal to:
|
* Constructs an enumeration that expands the pattern term.
|
||||||
* <code>getTermsEnum(terms, new AttributeSource())</code>
|
* This method should only be called if the field exists (ie,
|
||||||
|
* implementations can assume the field does exist).
|
||||||
|
* This method never returns null.
|
||||||
|
* The returned TermsEnum is positioned to the first matching term.
|
||||||
*/
|
*/
|
||||||
protected final TermsEnum getTermsEnum(Terms terms) throws IOException {
|
public final TermsEnum getTermsEnum(Terms terms) throws IOException {
|
||||||
return getTermsEnum(terms, new AttributeSource());
|
return getTermsEnum(terms, new AttributeSource());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -330,6 +330,22 @@ public class PhraseQuery extends Query {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Creates a holder for the postings of one phrase position.
 *
 * @param postings postings enumeration for this position
 * @param impacts  impacts enumeration for this position
 * @param position position of the term(s) within the phrase
 * @param terms    terms at this position; may be {@code null} or empty, in which case
 *                 no term array is kept. When several terms are given they are stored
 *                 in sorted order.
 */
public PostingsAndFreq(PostingsEnum postings, ImpactsEnum impacts, int position, List<Term> terms) {
  this.postings = postings;
  this.impacts = impacts;
  this.position = position;
  if (terms == null) {
    this.nTerms = 0;
  } else {
    this.nTerms = terms.size();
  }
  if (this.nTerms <= 0) {
    // No term to remember for this position.
    this.terms = null;
  } else {
    Term[] sortedTerms = terms.toArray(new Term[0]);
    if (this.nTerms > 1) {
      // A single term is trivially sorted; only sort when there are several.
      Arrays.sort(sortedTerms);
    }
    this.terms = sortedTerms;
  }
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compareTo(PostingsAndFreq other) {
|
public int compareTo(PostingsAndFreq other) {
|
||||||
if (position != other.position) {
|
if (position != other.position) {
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,570 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.search;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.document.Field;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.Terms;
|
||||||
|
import org.apache.lucene.index.TermsEnum;
|
||||||
|
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
|
||||||
|
import org.apache.lucene.search.spans.SpanNearQuery;
|
||||||
|
import org.apache.lucene.search.spans.SpanQuery;
|
||||||
|
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||||
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
|
|
||||||
|
import static org.apache.lucene.search.PhraseWildcardQuery.TestCounters;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tests {@link PhraseWildcardQuery}.
|
||||||
|
* <p>
|
||||||
|
* The main goal of this class is to verify that {@link PhraseWildcardQuery}
|
||||||
|
* has the same ranking and same scoring as both {@link MultiPhraseQuery}
|
||||||
|
* and {@link SpanNearQuery}.
|
||||||
|
* <p>
|
||||||
|
* Note that the ranking and scoring are equal if the segment optimization
|
||||||
|
* is disabled, otherwise it may change the score, but the ranking is most
|
||||||
|
* often the same.
|
||||||
|
*/
|
||||||
|
public class TestPhraseWildcardQuery extends LuceneTestCase {
|
||||||
|
|
||||||
|
// Maximum number of hits requested from the searcher in every search of this test.
protected static final int MAX_DOCS = 1000;
|
||||||
|
// Names of the indexed fields; looked up by position through field(int).
protected static final String[] FIELDS = {"title", "author", "category", "other"};
|
||||||
|
|
||||||
|
// Directory holding the test index; created in setUp() and closed in tearDown().
protected Directory directory;
|
||||||
|
// Reader obtained from the index writer in setUp(); closed in tearDown().
protected IndexReader reader;
|
||||||
|
// Searcher built over the reader in setUp(); used by all searches of this test.
protected IndexSearcher searcher;
|
||||||
|
// When true, scores are not compared against SpanNearQuery (only the matching doc ids are).
// Toggled by expectDifferentScoreForSpanNearQueryWithMultiTermSubset().
protected boolean differentScoreExpectedForSpanNearQuery;
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
super.setUp();
|
||||||
|
directory = newDirectory();
|
||||||
|
RandomIndexWriter iw = new RandomIndexWriter(random(), directory);
|
||||||
|
iw.setDoRandomForceMerge(false); // Keep the segments separated.
|
||||||
|
addSegments(iw);
|
||||||
|
reader = iw.getReader();
|
||||||
|
iw.close();
|
||||||
|
searcher = newSearcher(reader);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void tearDown() throws Exception {
|
||||||
|
reader.close();
|
||||||
|
directory.close();
|
||||||
|
super.tearDown();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOneMultiTerm() throws Exception {
|
||||||
|
searchAndCheckResults(field(1), 100, "eric", "br*");
|
||||||
|
assertEquals(1, TestCounters.get().singleTermAnalysisCount);
|
||||||
|
assertEquals(1, TestCounters.get().multiTermAnalysisCount);
|
||||||
|
assertEquals(4, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(0, TestCounters.get().segmentSkipCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTwoMultiTerms() throws Exception {
|
||||||
|
searchAndCheckResults(field(1), 100, "e*", "b*");
|
||||||
|
assertEquals(0, TestCounters.get().singleTermAnalysisCount);
|
||||||
|
assertEquals(2, TestCounters.get().multiTermAnalysisCount);
|
||||||
|
assertEquals(4, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(0, TestCounters.get().segmentSkipCount);
|
||||||
|
|
||||||
|
expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> {
|
||||||
|
searchAndCheckResults(field(2), 100, "tim*", "t*");
|
||||||
|
assertEquals(0, TestCounters.get().singleTermAnalysisCount);
|
||||||
|
assertEquals(2, TestCounters.get().multiTermAnalysisCount);
|
||||||
|
assertEquals(2, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(1, TestCounters.get().segmentSkipCount);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testThreeMultiTerms() throws Exception {
|
||||||
|
searchAndCheckResults(field(0), 100, "t*", "ut?pi?", "e*");
|
||||||
|
assertEquals(0, TestCounters.get().singleTermAnalysisCount);
|
||||||
|
assertEquals(3, TestCounters.get().multiTermAnalysisCount);
|
||||||
|
assertEquals(4, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(1, TestCounters.get().segmentSkipCount);
|
||||||
|
|
||||||
|
searchAndCheckResults(field(0), 100, "t?e", "u*", "e*");
|
||||||
|
assertEquals(0, TestCounters.get().singleTermAnalysisCount);
|
||||||
|
assertEquals(3, TestCounters.get().multiTermAnalysisCount);
|
||||||
|
assertEquals(4, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(1, TestCounters.get().segmentSkipCount);
|
||||||
|
|
||||||
|
expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> {
|
||||||
|
searchAndCheckResults(field(0), 100, "t?e", "b*", "b*");
|
||||||
|
assertEquals(0, TestCounters.get().singleTermAnalysisCount);
|
||||||
|
assertEquals(3, TestCounters.get().multiTermAnalysisCount);
|
||||||
|
assertEquals(4, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(1, TestCounters.get().segmentSkipCount);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testOneSingleTermTwoMultiTerms() throws Exception {
|
||||||
|
searchAndCheckResults(field(0), 100, "t*", "utopia", "e*");
|
||||||
|
assertEquals(1, TestCounters.get().singleTermAnalysisCount);
|
||||||
|
assertEquals(2, TestCounters.get().multiTermAnalysisCount);
|
||||||
|
assertEquals(4, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(1, TestCounters.get().segmentSkipCount);
|
||||||
|
|
||||||
|
searchAndCheckResults(field(0), 100, "t?e", "utopia", "e*");
|
||||||
|
assertEquals(1, TestCounters.get().singleTermAnalysisCount);
|
||||||
|
assertEquals(2, TestCounters.get().multiTermAnalysisCount);
|
||||||
|
assertEquals(4, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(1, TestCounters.get().segmentSkipCount);
|
||||||
|
|
||||||
|
searchAndCheckResults(field(0), 100, "t?a", "utopia", "e*");
|
||||||
|
assertEquals(1, TestCounters.get().singleTermAnalysisCount);
|
||||||
|
assertEquals(1, TestCounters.get().multiTermAnalysisCount);
|
||||||
|
assertEquals(3, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(2, TestCounters.get().segmentSkipCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testTermDoesNotMatch() throws Exception {
|
||||||
|
searchAndCheckResults(field(0), 100, "nomatch", "e*");
|
||||||
|
// We expect that createWeight() is not called because the first term does
|
||||||
|
// not match so the query is early stopped without multi-term expansion.
|
||||||
|
assertEquals(1, TestCounters.get().singleTermAnalysisCount);
|
||||||
|
assertEquals(0, TestCounters.get().multiTermAnalysisCount);
|
||||||
|
assertEquals(2, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(2, TestCounters.get().segmentSkipCount);
|
||||||
|
|
||||||
|
searchAndCheckResults(field(0), 100, "t*", "nomatch", "e*");
|
||||||
|
assertEquals(1, TestCounters.get().singleTermAnalysisCount);
|
||||||
|
assertEquals(0, TestCounters.get().multiTermAnalysisCount);
|
||||||
|
assertEquals(2, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(2, TestCounters.get().segmentSkipCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testNoMultiTerm() throws Exception {
|
||||||
|
searchAndCheckResults(field(0), 100, "the", "utopia");
|
||||||
|
searchAndCheckResults(field(0), 100, "utopia", "the");
|
||||||
|
searchAndCheckResults(field(0), 100, "the", "experiment");
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMaxExpansions() throws Exception {
|
||||||
|
// The limit on the number of expansions is different with PhraseWildcardQuery
|
||||||
|
// because it applies to each segments individually, and not globally unlike
|
||||||
|
// MultiPhraseQuery and SpanMultiTermQueryWrapper.
|
||||||
|
// Here we verify the total number of expansions directly from test stats
|
||||||
|
// inside PhraseWildcardQuery.
|
||||||
|
|
||||||
|
clearTestCounters();
|
||||||
|
searcher.search(phraseWildcardQuery(field(1), 3, 0, true, "e*", "b*"), MAX_DOCS);
|
||||||
|
// We expect 3 expansions even if both multi-terms have potentially more expansions.
|
||||||
|
assertEquals(3, TestCounters.get().expansionCount);
|
||||||
|
|
||||||
|
clearTestCounters();
|
||||||
|
searcher.search(phraseWildcardQuery(field(0), 4, 0, true, "t?e", "utopia", "e*"), MAX_DOCS);
|
||||||
|
// We expect 2 expansions since the "utopia" term matches only in the
|
||||||
|
// first segment, so there is no expansion for the second segment.
|
||||||
|
assertEquals(2, TestCounters.get().expansionCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSegmentOptimizationSingleField() throws Exception {
|
||||||
|
searchAndCheckResults(field(0), 100, 0, true, "b*", "e*");
|
||||||
|
// Both multi-terms are present in both segments.
|
||||||
|
// So expecting 4 segment accesses.
|
||||||
|
assertEquals(4, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(0, TestCounters.get().segmentSkipCount);
|
||||||
|
assertEquals(0, TestCounters.get().queryEarlyStopCount);
|
||||||
|
|
||||||
|
searchAndCheckResults(field(0), 100, 0, true, "t?e", "b*", "e*");
|
||||||
|
// "t?e" matches only in the first segment. This term adds 2 segment accesses and 1 segment skip.
|
||||||
|
// The other multi-terms match in the first segment. Each one adds 1 segment access.
|
||||||
|
// So expecting 3 segment accesses and 1 segment skips.
|
||||||
|
assertEquals(4, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(1, TestCounters.get().segmentSkipCount);
|
||||||
|
assertEquals(0, TestCounters.get().queryEarlyStopCount);
|
||||||
|
|
||||||
|
searchAndCheckResults(field(0), 100, 0, true, "t?e", "blind", "e*");
|
||||||
|
assertEquals(3, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(2, TestCounters.get().segmentSkipCount);
|
||||||
|
assertEquals(1, TestCounters.get().queryEarlyStopCount);
|
||||||
|
|
||||||
|
expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> {
|
||||||
|
searchAndCheckResults(field(2), 100, 0, true, "tim*", "t*");
|
||||||
|
assertEquals(2, TestCounters.get().segmentUseCount);
|
||||||
|
assertEquals(1, TestCounters.get().segmentSkipCount);
|
||||||
|
assertEquals(0, TestCounters.get().queryEarlyStopCount);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMultiplePhraseWildcards() throws Exception {
|
||||||
|
searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
|
||||||
|
new String[]{"e*", "b*"},
|
||||||
|
new String[]{"t?e", "utopia"}
|
||||||
|
});
|
||||||
|
searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
|
||||||
|
new String[]{"e*", "b*"},
|
||||||
|
new String[]{"d*", "b*"}
|
||||||
|
});
|
||||||
|
searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
|
||||||
|
new String[]{"e*", "b*"},
|
||||||
|
new String[]{"t?e", "utopia"},
|
||||||
|
new String[]{"d*", "b*"}
|
||||||
|
});
|
||||||
|
expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() ->
|
||||||
|
searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
|
||||||
|
new String[]{"e*", "b*"},
|
||||||
|
new String[]{"b*", "b*"}
|
||||||
|
}));
|
||||||
|
expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() ->
|
||||||
|
searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
|
||||||
|
new String[]{"e*", "b*"},
|
||||||
|
new String[]{"b*", "b*"},
|
||||||
|
new String[]{"t?e", "utopia"}
|
||||||
|
}));
|
||||||
|
searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
|
||||||
|
new String[]{"e*", "b*"},
|
||||||
|
new String[]{"e*", "b*"}
|
||||||
|
});
|
||||||
|
searchAndCheckResultsMultiplePhraseWildcards(new String[]{field(1), field(0), field(3)}, 100, 0, new String[][]{
|
||||||
|
new String[]{"e*", "b*"},
|
||||||
|
new String[]{"t?e", "utopia"},
|
||||||
|
new String[]{"e*", "b*"}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testToString() {
|
||||||
|
Query testQuery = phraseWildcardQuery(field(0), 100, 0, true, "t?e", "b*", "e*");
|
||||||
|
assertEquals("phraseWildcard(title:\"t?e b* e*\")", testQuery.toString());
|
||||||
|
|
||||||
|
testQuery = phraseWildcardQuery(field(0), 100, 1, true, "t?e", "utopia", "e*");
|
||||||
|
assertEquals("phraseWildcard(\"t?e utopia e*\"~1)", testQuery.toString(field(0)));
|
||||||
|
|
||||||
|
testQuery = phraseWildcardQuery(field(0), 100, 1, true, "t?e", "b*", "b*");
|
||||||
|
assertEquals("phraseWildcard(\"t?e b* b*\"~1)", testQuery.toString(field(0)));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testExplain() throws IOException {
|
||||||
|
Query testQuery = phraseWildcardQuery(field(0), 100, 0, true, "t?e", "b*", "b*");
|
||||||
|
|
||||||
|
// Verify the standard way to get the query explanation.
|
||||||
|
for (ScoreDoc scoreDoc : searcher.search(testQuery, MAX_DOCS).scoreDocs) {
|
||||||
|
Explanation explanation = searcher.explain(testQuery, scoreDoc.doc);
|
||||||
|
assertTrue(explanation.getValue().doubleValue() > 0);
|
||||||
|
assertEquals("weight(phraseWildcard(title:\"t?e b* b*\") in 1) [AssertingSimilarity], result of:", explanation.getDescription());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify that if we call PhraseWildcardQuery.PhraseWildcardWeight.scorer() twice,
|
||||||
|
// the scoring is correct (even if it is not the standard path expected by the scorer() method).
|
||||||
|
int resultCount = 0;
|
||||||
|
Weight weight = testQuery.createWeight(searcher, ScoreMode.TOP_SCORES, 1);
|
||||||
|
for (LeafReaderContext leafReaderContext : searcher.getIndexReader().leaves()) {
|
||||||
|
Scorer scorer = weight.scorer(leafReaderContext);
|
||||||
|
if (scorer != null) {
|
||||||
|
DocIdSetIterator iterator = scorer.iterator();
|
||||||
|
while (iterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||||
|
resultCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertEquals(1, resultCount);
|
||||||
|
|
||||||
|
int explanationWithNonNullScoreCount = 0;
|
||||||
|
for (LeafReaderContext leafReaderContext : searcher.getIndexReader().leaves()) {
|
||||||
|
Explanation explanation = weight.explain(leafReaderContext, 1);
|
||||||
|
if (explanation.getValue().doubleValue() > 0) {
|
||||||
|
explanationWithNonNullScoreCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertEquals(1, explanationWithNonNullScoreCount);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* With two similar multi-terms which expansions are subsets (e.g. "tim*" and "t*"),
|
||||||
|
* we expect {@link PhraseWildcardQuery} and {@link MultiPhraseQuery} to
|
||||||
|
* have the same scores, but {@link SpanNearQuery} scores are different.
|
||||||
|
*/
|
||||||
|
protected void expectDifferentScoreForSpanNearQueryWithMultiTermSubset(RunnableWithIOException runnable) throws IOException {
|
||||||
|
try {
|
||||||
|
differentScoreExpectedForSpanNearQuery = true;
|
||||||
|
runnable.run();
|
||||||
|
} finally {
|
||||||
|
differentScoreExpectedForSpanNearQuery = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compares {@link PhraseWildcardQuery} to both {@link MultiPhraseQuery}
|
||||||
|
* and {@link SpanNearQuery}.
|
||||||
|
*/
|
||||||
|
protected void searchAndCheckResults(String field, int maxExpansions, String... terms) throws IOException {
|
||||||
|
for (int slop = 0; slop <= 1; slop++) {
|
||||||
|
searchAndCheckResults(field, maxExpansions, slop, false, terms);
|
||||||
|
searchAndCheckResults(field, maxExpansions, slop, true, terms);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void searchAndCheckResults(String field, int maxExpansions, int slop,
|
||||||
|
boolean segmentOptimizationEnabled, String... terms) throws IOException {
|
||||||
|
searchAndCheckSameResults(
|
||||||
|
phraseWildcardQuery(field, maxExpansions, slop, segmentOptimizationEnabled, terms),
|
||||||
|
multiPhraseQuery(field, maxExpansions, slop, terms),
|
||||||
|
spanNearQuery(field, slop, terms),
|
||||||
|
segmentOptimizationEnabled);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void searchAndCheckResultsMultiplePhraseWildcards(String[] fields, int maxExpansions,
|
||||||
|
int slop, String[][] multiPhraseTerms) throws IOException {
|
||||||
|
searchAndCheckResultsMultiplePhraseWildcards(fields, maxExpansions, slop, false, multiPhraseTerms);
|
||||||
|
searchAndCheckResultsMultiplePhraseWildcards(fields, maxExpansions, slop, true, multiPhraseTerms);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void searchAndCheckResultsMultiplePhraseWildcards(String[] fields, int maxExpansions, int slop,
|
||||||
|
boolean segmentOptimizationEnabled, String[][] multiPhraseTerms) throws IOException {
|
||||||
|
BooleanQuery.Builder phraseWildcardQueryBuilder = new BooleanQuery.Builder();
|
||||||
|
BooleanQuery.Builder multiPhraseQueryBuilder = new BooleanQuery.Builder();
|
||||||
|
BooleanQuery.Builder spanNearQueryBuilder = new BooleanQuery.Builder();
|
||||||
|
for (String[] terms : multiPhraseTerms) {
|
||||||
|
BooleanClause.Occur occur = random().nextBoolean() ? BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
|
||||||
|
phraseWildcardQueryBuilder.add(disMaxQuery(phraseWildcardQueries(fields, maxExpansions, slop, segmentOptimizationEnabled, terms)), occur);
|
||||||
|
multiPhraseQueryBuilder.add(disMaxQuery(multiPhraseQueries(fields, maxExpansions, slop, terms)), occur);
|
||||||
|
spanNearQueryBuilder.add(disMaxQuery(spanNearQueries(fields, slop, terms)), occur);
|
||||||
|
}
|
||||||
|
searchAndCheckSameResults(
|
||||||
|
phraseWildcardQueryBuilder.build(),
|
||||||
|
multiPhraseQueryBuilder.build(),
|
||||||
|
spanNearQueryBuilder.build(),
|
||||||
|
segmentOptimizationEnabled
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Query disMaxQuery(Query... disjuncts) {
|
||||||
|
return new DisjunctionMaxQuery(Arrays.asList(disjuncts), 0.1f);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Query[] phraseWildcardQueries(String[] fields, int maxExpansions, int slop, boolean segmentOptimizationEnabled, String... terms) {
|
||||||
|
Query[] queries = new Query[fields.length];
|
||||||
|
for (int i = 0; i < fields.length; i++) {
|
||||||
|
queries[i] = phraseWildcardQuery(fields[i], maxExpansions, slop, segmentOptimizationEnabled, terms);
|
||||||
|
}
|
||||||
|
return queries;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Query[] multiPhraseQueries(String[] fields, int maxExpansions, int slop, String... terms) throws IOException {
|
||||||
|
Query[] queries = new Query[fields.length];
|
||||||
|
for (int i = 0; i < fields.length; i++) {
|
||||||
|
queries[i] = multiPhraseQuery(fields[i], maxExpansions, slop, terms);
|
||||||
|
}
|
||||||
|
return queries;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Query[] spanNearQueries(String[] fields, int slop, String... terms) {
|
||||||
|
Query[] queries = new Query[fields.length];
|
||||||
|
for (int i = 0; i < fields.length; i++) {
|
||||||
|
queries[i] = spanNearQuery(fields[i], slop, terms);
|
||||||
|
}
|
||||||
|
return queries;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void searchAndCheckSameResults(Query testQuery, Query multiPhraseQuery, Query spanNearQuery, boolean segmentOptimizationEnabled) throws IOException {
|
||||||
|
// Search and compare results with MultiPhraseQuery.
|
||||||
|
// Do not compare the scores if the segment optimization is enabled because
|
||||||
|
// it changes the score (but not the result ranking).
|
||||||
|
boolean sameScoreExpected = !segmentOptimizationEnabled;
|
||||||
|
searchAndCheckSameResults(testQuery, multiPhraseQuery, sameScoreExpected);
|
||||||
|
|
||||||
|
// Clear the test stats to verify them only with the last test query execution.
|
||||||
|
clearTestCounters();
|
||||||
|
// Search and compare results with SpanNearQuery.
|
||||||
|
sameScoreExpected = !segmentOptimizationEnabled && !differentScoreExpectedForSpanNearQuery;
|
||||||
|
searchAndCheckSameResults(testQuery, spanNearQuery, sameScoreExpected);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void clearTestCounters() {
|
||||||
|
TestCounters.get().clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void searchAndCheckSameResults(Query testQuery, Query referenceQuery,
|
||||||
|
boolean compareScores) throws IOException {
|
||||||
|
ScoreDoc[] testResults = searcher.search(testQuery, MAX_DOCS).scoreDocs;
|
||||||
|
ScoreDoc[] referenceResults = searcher.search(referenceQuery, MAX_DOCS).scoreDocs;
|
||||||
|
assertEquals("Number of results differ when comparing to " + referenceQuery.getClass().getSimpleName(),
|
||||||
|
referenceResults.length, testResults.length);
|
||||||
|
if (compareScores) {
|
||||||
|
for (int i = 0; i < testResults.length; i++) {
|
||||||
|
ScoreDoc testResult = testResults[i];
|
||||||
|
ScoreDoc referenceResult = referenceResults[i];
|
||||||
|
assertTrue("Result " + i + " differ when comparing to " + referenceQuery.getClass().getSimpleName()
|
||||||
|
+ "\ntestResults=" + Arrays.toString(testResults) + "\nreferenceResults=" + Arrays.toString(referenceResults),
|
||||||
|
equals(testResult, referenceResult));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Set<Integer> testResultDocIds = Arrays.stream(testResults).map(scoreDoc -> scoreDoc.doc).collect(Collectors.toSet());
|
||||||
|
Set<Integer> referenceResultDocIds = Arrays.stream(referenceResults).map(scoreDoc -> scoreDoc.doc).collect(Collectors.toSet());
|
||||||
|
assertEquals("Results differ when comparing to " + referenceQuery.getClass().getSimpleName()
|
||||||
|
+ " ignoring score\ntestResults=" + Arrays.toString(testResults) + "\nreferenceResults=" + Arrays.toString(referenceResults),
|
||||||
|
referenceResultDocIds, testResultDocIds);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected PhraseWildcardQuery phraseWildcardQuery(String field, int maxExpansions,
|
||||||
|
int slop, boolean segmentOptimizationEnabled, String... terms) {
|
||||||
|
PhraseWildcardQuery.Builder builder = createPhraseWildcardQueryBuilder(field, maxExpansions, segmentOptimizationEnabled)
|
||||||
|
.setSlop(slop);
|
||||||
|
for (String term : terms) {
|
||||||
|
if (term.contains("*") || term.contains("?")) {
|
||||||
|
builder.addMultiTerm(new WildcardQuery(new Term(field, term)));
|
||||||
|
} else {
|
||||||
|
builder.addTerm(new BytesRef(term));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return builder.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected PhraseWildcardQuery.Builder createPhraseWildcardQueryBuilder(
|
||||||
|
String field, int maxExpansions, boolean segmentOptimizationEnabled) {
|
||||||
|
return new PhraseWildcardQuery.Builder(field, maxExpansions, segmentOptimizationEnabled);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected SpanNearQuery spanNearQuery(String field, int slop, String... terms) {
|
||||||
|
SpanQuery[] spanQueries = new SpanQuery[terms.length];
|
||||||
|
for (int i = 0; i < terms.length; i++) {
|
||||||
|
String term = terms[i];
|
||||||
|
spanQueries[i] = term.contains("*") || term.contains("?") ?
|
||||||
|
new SpanMultiTermQueryWrapper<>(new WildcardQuery(new Term(field, term)))
|
||||||
|
: new SpanTermQuery(new Term(field, term));
|
||||||
|
}
|
||||||
|
return new SpanNearQuery(spanQueries, slop, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected MultiPhraseQuery multiPhraseQuery(String field, int maxExpansions, int slop, String... terms) throws IOException {
|
||||||
|
MultiPhraseQuery.Builder builder = new MultiPhraseQuery.Builder()
|
||||||
|
.setSlop(slop);
|
||||||
|
for (String term : terms) {
|
||||||
|
if (term.contains("*") || term.contains("?")) {
|
||||||
|
Term[] expansions = expandMultiTerm(field, term, maxExpansions);
|
||||||
|
if (expansions.length > 0) {
|
||||||
|
builder.add(expansions);
|
||||||
|
} else {
|
||||||
|
builder.add(new Term(field, "non-matching-term"));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
builder.add(new Term(field, term));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return builder.build();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected Term[] expandMultiTerm(String field, String term, int maxExpansions) throws IOException {
|
||||||
|
if (maxExpansions == 0) {
|
||||||
|
return new Term[0];
|
||||||
|
}
|
||||||
|
Set<Term> expansions = new HashSet<>();
|
||||||
|
WildcardQuery wq = new WildcardQuery(new Term(field, term));
|
||||||
|
expansion:
|
||||||
|
for (final LeafReaderContext ctx : reader.leaves()) {
|
||||||
|
Terms terms = ctx.reader().terms(field);
|
||||||
|
if (terms != null) {
|
||||||
|
TermsEnum termsEnum = wq.getTermsEnum(terms);
|
||||||
|
while (termsEnum.next() != null) {
|
||||||
|
expansions.add(new Term(field, termsEnum.term()));
|
||||||
|
if (expansions.size() >= maxExpansions) {
|
||||||
|
break expansion;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return expansions.toArray(new Term[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static boolean equals(ScoreDoc result1, ScoreDoc result2) {
|
||||||
|
// Due to randomness, the value of the score comparison epsilon varies much.
|
||||||
|
// We take 1E-1 epsilon to ensure the test do not flap.
|
||||||
|
return result1.doc == result2.doc && (Math.abs(result1.score - result2.score) < 1E-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void addSegments(RandomIndexWriter iw) throws IOException {
|
||||||
|
// First segment.
|
||||||
|
addDocs(iw,
|
||||||
|
doc(
|
||||||
|
field(field(0), "time conversion"),
|
||||||
|
field(field(1), "eric hawk"),
|
||||||
|
field(field(2), "time travel")
|
||||||
|
),
|
||||||
|
doc(
|
||||||
|
field(field(0), "the blinking books"),
|
||||||
|
field(field(1), "donald ever"),
|
||||||
|
field(field(2), "time travel")
|
||||||
|
),
|
||||||
|
doc(
|
||||||
|
field(field(0), "the utopia experiment"),
|
||||||
|
field(field(1), "dylan brief"),
|
||||||
|
field(field(2), "utopia"),
|
||||||
|
field(field(3), "travelling to utopiapolis")
|
||||||
|
)
|
||||||
|
);
|
||||||
|
iw.commit();
|
||||||
|
|
||||||
|
// Second segment.
|
||||||
|
// No field(2).
|
||||||
|
addDocs(iw,
|
||||||
|
doc(
|
||||||
|
field(field(0), "serene evasion"),
|
||||||
|
field(field(1), "eric brown")
|
||||||
|
),
|
||||||
|
doc(
|
||||||
|
field(field(0), "my blind experiment"),
|
||||||
|
field(field(1), "eric bright")
|
||||||
|
),
|
||||||
|
doc(
|
||||||
|
field(field(3), "two times travel")
|
||||||
|
)
|
||||||
|
);
|
||||||
|
iw.commit();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String field(int index) {
|
||||||
|
return FIELDS[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static void addDocs(RandomIndexWriter iw, Document... docs) throws IOException {
|
||||||
|
iw.addDocuments(Arrays.asList(docs));
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static Document doc(Field... fields) {
|
||||||
|
Document doc = new Document();
|
||||||
|
for (Field field : fields) {
|
||||||
|
doc.add(field);
|
||||||
|
}
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static Field field(String field, String fieldValue) {
|
||||||
|
return newTextField(field, fieldValue, Field.Store.NO);
|
||||||
|
}
|
||||||
|
|
||||||
|
private interface RunnableWithIOException {
|
||||||
|
|
||||||
|
void run() throws IOException;
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue