LUCENE-8983: Add PhraseWildcardQuery to control multi-terms expansions in phrase.

This commit is contained in:
Bruno Roustant 2019-09-18 17:43:53 +02:00 committed by Bruno Roustant
parent c6efd58282
commit d764bf345e
No known key found for this signature in database
GPG Key ID: CD28DABB95360525
5 changed files with 1648 additions and 4 deletions

View File

@ -79,6 +79,8 @@ Other
* LUCENE-9046: Fix wrong example in Javadoc of TermInSetQuery (Namgyu Kim) * LUCENE-9046: Fix wrong example in Javadoc of TermInSetQuery (Namgyu Kim)
* LUCENE-8983: Add sandbox PhraseWildcardQuery to control multi-terms expansions in a phrase. (Bruno Roustant)
Build Build
* Upgrade forbiddenapis to version 2.7; upgrade Groovy to 2.4.17. (Uwe Schindler) * Upgrade forbiddenapis to version 2.7; upgrade Groovy to 2.4.17. (Uwe Schindler)

View File

@ -292,11 +292,14 @@ public abstract class MultiTermQuery extends Query {
*/ */
protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException; protected abstract TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException;
/** Convenience method, if no attributes are needed: /**
* This simply passes empty attributes and is equal to: * Constructs an enumeration that expands the pattern term.
* <code>getTermsEnum(terms, new AttributeSource())</code> * This method should only be called if the field exists (ie,
* implementations can assume the field does exist).
* This method never returns null.
* The returned TermsEnum is positioned to the first matching term.
*/ */
protected final TermsEnum getTermsEnum(Terms terms) throws IOException { public final TermsEnum getTermsEnum(Terms terms) throws IOException {
return getTermsEnum(terms, new AttributeSource()); return getTermsEnum(terms, new AttributeSource());
} }

View File

@ -332,6 +332,22 @@ public class PhraseQuery extends Query {
} }
} }
public PostingsAndFreq(PostingsEnum postings, ImpactsEnum impacts, int position, List<Term> terms) {
this.postings = postings;
this.impacts = impacts;
this.position = position;
nTerms = terms == null ? 0 : terms.size();
if (nTerms > 0) {
Term[] terms2 = terms.toArray(new Term[0]);
if (nTerms > 1) {
Arrays.sort(terms2);
}
this.terms = terms2;
} else {
this.terms = null;
}
}
@Override @Override
public int compareTo(PostingsAndFreq other) { public int compareTo(PostingsAndFreq other) {
if (position != other.position) { if (position != other.position) {

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,570 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import static org.apache.lucene.search.PhraseWildcardQuery.TestCounters;
/**
* Tests {@link PhraseWildcardQuery}.
* <p>
* The main goal of this class is to verify that {@link PhraseWildcardQuery}
* has the same ranking and the same scoring as both {@link MultiPhraseQuery}
* and {@link SpanNearQuery}.
* <p>
* Note that the ranking and scoring are equal if the segment optimization
* is disabled, otherwise it may change the score, but the ranking is most
* often the same.
*/
public class TestPhraseWildcardQuery extends LuceneTestCase {
/** Maximum number of hits requested from each search. */
protected static final int MAX_DOCS = 1000;
/** Names of the indexed fields; tests pick one by index through {@link #field(int)}. */
protected static final String[] FIELDS = {"title", "author", "category", "other"};
/** Index directory, created in {@link #setUp()} and closed in {@link #tearDown()}. */
protected Directory directory;
/** Reader over the test index, opened in {@link #setUp()} and closed in {@link #tearDown()}. */
protected IndexReader reader;
/** Searcher built on top of {@link #reader}. */
protected IndexSearcher searcher;
/** When true, scores are not compared against the {@link SpanNearQuery} results. */
protected boolean differentScoreExpectedForSpanNearQuery;
/**
 * Creates the directory, indexes the documents provided by
 * {@link #addSegments(RandomIndexWriter)} and opens the reader and searcher used by the tests.
 */
@Override
public void setUp() throws Exception {
  super.setUp();
  this.directory = newDirectory();
  final RandomIndexWriter writer = new RandomIndexWriter(random(), this.directory);
  // Keep the segments separated.
  writer.setDoRandomForceMerge(false);
  addSegments(writer);
  this.reader = writer.getReader();
  writer.close();
  this.searcher = newSearcher(this.reader);
}
/** Closes the reader, then the directory, before delegating to the parent tear-down. */
@Override
public void tearDown() throws Exception {
  this.reader.close();
  this.directory.close();
  super.tearDown();
}
/**
 * Searches the "author" field for the phrase made of the plain term "eric" followed by the
 * wildcard "br*", then checks the counters exposed by {@link TestCounters}.
 */
public void testOneMultiTerm() throws Exception {
searchAndCheckResults(field(1), 100, "eric", "br*");
// Expected: one single-term analysis, one multi-term analysis, four segment uses, no segment skip.
assertEquals(1, TestCounters.get().singleTermAnalysisCount);
assertEquals(1, TestCounters.get().multiTermAnalysisCount);
assertEquals(4, TestCounters.get().segmentUseCount);
assertEquals(0, TestCounters.get().segmentSkipCount);
}
/**
 * Searches phrases made of two wildcards: "e* b*" in the "author" field, then "tim* t*" in the
 * "category" field, checking the {@link TestCounters} values after each search.
 */
public void testTwoMultiTerms() throws Exception {
searchAndCheckResults(field(1), 100, "e*", "b*");
// Expected: no single-term analysis, two multi-term analyses, four segment uses, no segment skip.
assertEquals(0, TestCounters.get().singleTermAnalysisCount);
assertEquals(2, TestCounters.get().multiTermAnalysisCount);
assertEquals(4, TestCounters.get().segmentUseCount);
assertEquals(0, TestCounters.get().segmentSkipCount);
// "tim*" and "t*": the scores are not compared against SpanNearQuery inside this wrapper.
expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> {
searchAndCheckResults(field(2), 100, "tim*", "t*");
assertEquals(0, TestCounters.get().singleTermAnalysisCount);
assertEquals(2, TestCounters.get().multiTermAnalysisCount);
assertEquals(2, TestCounters.get().segmentUseCount);
assertEquals(1, TestCounters.get().segmentSkipCount);
});
}
/**
 * Searches the "title" field with three phrases made of three wildcards each, checking the
 * {@link TestCounters} values after each search.
 */
public void testThreeMultiTerms() throws Exception {
searchAndCheckResults(field(0), 100, "t*", "ut?pi?", "e*");
// Expected: three multi-term analyses, four segment uses and one segment skip.
assertEquals(0, TestCounters.get().singleTermAnalysisCount);
assertEquals(3, TestCounters.get().multiTermAnalysisCount);
assertEquals(4, TestCounters.get().segmentUseCount);
assertEquals(1, TestCounters.get().segmentSkipCount);
searchAndCheckResults(field(0), 100, "t?e", "u*", "e*");
// Same expected counters as for the previous phrase.
assertEquals(0, TestCounters.get().singleTermAnalysisCount);
assertEquals(3, TestCounters.get().multiTermAnalysisCount);
assertEquals(4, TestCounters.get().segmentUseCount);
assertEquals(1, TestCounters.get().segmentSkipCount);
// The same wildcard "b*" appears twice: the scores are not compared against SpanNearQuery
// inside this wrapper.
expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> {
searchAndCheckResults(field(0), 100, "t?e", "b*", "b*");
assertEquals(0, TestCounters.get().singleTermAnalysisCount);
assertEquals(3, TestCounters.get().multiTermAnalysisCount);
assertEquals(4, TestCounters.get().segmentUseCount);
assertEquals(1, TestCounters.get().segmentSkipCount);
});
}
/**
 * Searches the "title" field with phrases made of the plain term "utopia" surrounded by two
 * wildcards, checking the {@link TestCounters} values after each search.
 */
public void testOneSingleTermTwoMultiTerms() throws Exception {
searchAndCheckResults(field(0), 100, "t*", "utopia", "e*");
// Expected: one single-term analysis, two multi-term analyses, four segment uses, one segment skip.
assertEquals(1, TestCounters.get().singleTermAnalysisCount);
assertEquals(2, TestCounters.get().multiTermAnalysisCount);
assertEquals(4, TestCounters.get().segmentUseCount);
assertEquals(1, TestCounters.get().segmentSkipCount);
searchAndCheckResults(field(0), 100, "t?e", "utopia", "e*");
// Same expected counters as for the previous phrase.
assertEquals(1, TestCounters.get().singleTermAnalysisCount);
assertEquals(2, TestCounters.get().multiTermAnalysisCount);
assertEquals(4, TestCounters.get().segmentUseCount);
assertEquals(1, TestCounters.get().segmentSkipCount);
searchAndCheckResults(field(0), 100, "t?a", "utopia", "e*");
// With "t?a", only one multi-term analysis is expected, with three segment uses and two skips.
assertEquals(1, TestCounters.get().singleTermAnalysisCount);
assertEquals(1, TestCounters.get().multiTermAnalysisCount);
assertEquals(3, TestCounters.get().segmentUseCount);
assertEquals(2, TestCounters.get().segmentSkipCount);
}
/**
 * Searches the "title" field with phrases containing the plain term "nomatch", first as the
 * leading term and then between two wildcards, and checks that no multi-term analysis is counted.
 */
public void testTermDoesNotMatch() throws Exception {
searchAndCheckResults(field(0), 100, "nomatch", "e*");
// We expect that createWeight() is not called because the first term does
// not match, so the query stops early without any multi-term expansion.
assertEquals(1, TestCounters.get().singleTermAnalysisCount);
assertEquals(0, TestCounters.get().multiTermAnalysisCount);
assertEquals(2, TestCounters.get().segmentUseCount);
assertEquals(2, TestCounters.get().segmentSkipCount);
searchAndCheckResults(field(0), 100, "t*", "nomatch", "e*");
// Same expected counters when the non-matching term is in the middle of the phrase.
assertEquals(1, TestCounters.get().singleTermAnalysisCount);
assertEquals(0, TestCounters.get().multiTermAnalysisCount);
assertEquals(2, TestCounters.get().segmentUseCount);
assertEquals(2, TestCounters.get().segmentSkipCount);
}
/** Searches the "title" field with phrases that contain plain terms only (no wildcard). */
public void testNoMultiTerm() throws Exception {
  final String title = field(0);
  final String[][] phrases = {
      {"the", "utopia"},
      {"utopia", "the"},
      {"the", "experiment"},
  };
  for (String[] phrase : phrases) {
    searchAndCheckResults(title, 100, phrase);
  }
}
/**
 * Checks the total number of expansions recorded by {@link TestCounters} when the
 * maximum number of expansions is lower than what the wildcards could expand to.
 */
public void testMaxExpansions() throws Exception {
  // The limit on the number of expansions is different with PhraseWildcardQuery
  // because it applies to each segment individually, and not globally, unlike
  // MultiPhraseQuery and SpanMultiTermQueryWrapper.
  // Here we verify the total number of expansions directly from the test stats
  // inside PhraseWildcardQuery.
  clearTestCounters();
  final Query twoMultiTerms = phraseWildcardQuery(field(1), 3, 0, true, "e*", "b*");
  searcher.search(twoMultiTerms, MAX_DOCS);
  // We expect 3 expansions even if both multi-terms have potentially more expansions.
  assertEquals(3, TestCounters.get().expansionCount);

  clearTestCounters();
  final Query withSingleTerm = phraseWildcardQuery(field(0), 4, 0, true, "t?e", "utopia", "e*");
  searcher.search(withSingleTerm, MAX_DOCS);
  // We expect 2 expansions since the "utopia" term matches only in the
  // first segment, so there is no expansion for the second segment.
  assertEquals(2, TestCounters.get().expansionCount);
}
/**
 * Searches a single field with the segment optimization enabled and slop 0, and checks the
 * segment use, segment skip and query early-stop counters of {@link TestCounters}.
 */
public void testSegmentOptimizationSingleField() throws Exception {
searchAndCheckResults(field(0), 100, 0, true, "b*", "e*");
// Both multi-terms are present in both segments.
// So expecting 4 segment accesses.
assertEquals(4, TestCounters.get().segmentUseCount);
assertEquals(0, TestCounters.get().segmentSkipCount);
assertEquals(0, TestCounters.get().queryEarlyStopCount);
searchAndCheckResults(field(0), 100, 0, true, "t?e", "b*", "e*");
// "t?e" matches only in the first segment. This term adds 2 segment accesses and 1 segment skip.
// The other multi-terms match in the first segment. Each one adds 1 segment access.
// So expecting 4 segment accesses (2 + 1 + 1) and 1 segment skip.
assertEquals(4, TestCounters.get().segmentUseCount);
assertEquals(1, TestCounters.get().segmentSkipCount);
assertEquals(0, TestCounters.get().queryEarlyStopCount);
searchAndCheckResults(field(0), 100, 0, true, "t?e", "blind", "e*");
// With the plain term "blind" in the middle, one query early stop is expected.
assertEquals(3, TestCounters.get().segmentUseCount);
assertEquals(2, TestCounters.get().segmentSkipCount);
assertEquals(1, TestCounters.get().queryEarlyStopCount);
// "tim*" and "t*" on the "category" field: the scores are not compared against SpanNearQuery
// inside this wrapper.
expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() -> {
searchAndCheckResults(field(2), 100, 0, true, "tim*", "t*");
assertEquals(2, TestCounters.get().segmentUseCount);
assertEquals(1, TestCounters.get().segmentSkipCount);
assertEquals(0, TestCounters.get().queryEarlyStopCount);
});
}
/**
 * Combines several phrases, each searched over the "author", "title" and "other" fields,
 * and compares the results of the three query implementations for each combination.
 */
public void testMultiplePhraseWildcards() throws Exception {
  final String[] fields = {field(1), field(0), field(3)};

  searchAndCheckResultsMultiplePhraseWildcards(fields, 100, 0, new String[][] {
      {"e*", "b*"},
      {"t?e", "utopia"}
  });
  searchAndCheckResultsMultiplePhraseWildcards(fields, 100, 0, new String[][] {
      {"e*", "b*"},
      {"d*", "b*"}
  });
  searchAndCheckResultsMultiplePhraseWildcards(fields, 100, 0, new String[][] {
      {"e*", "b*"},
      {"t?e", "utopia"},
      {"d*", "b*"}
  });

  // Phrases that repeat the same wildcard ("b* b*"): SpanNearQuery scores are not compared.
  expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() ->
      searchAndCheckResultsMultiplePhraseWildcards(fields, 100, 0, new String[][] {
          {"e*", "b*"},
          {"b*", "b*"}
      }));
  expectDifferentScoreForSpanNearQueryWithMultiTermSubset(() ->
      searchAndCheckResultsMultiplePhraseWildcards(fields, 100, 0, new String[][] {
          {"e*", "b*"},
          {"b*", "b*"},
          {"t?e", "utopia"}
      }));

  // The same phrase used more than once.
  searchAndCheckResultsMultiplePhraseWildcards(fields, 100, 0, new String[][] {
      {"e*", "b*"},
      {"e*", "b*"}
  });
  searchAndCheckResultsMultiplePhraseWildcards(fields, 100, 0, new String[][] {
      {"e*", "b*"},
      {"t?e", "utopia"},
      {"e*", "b*"}
  });
}
/** Checks the string form of the query, with and without a default field and a slop. */
public void testToString() {
  final String title = field(0);

  final Query withoutSlop = phraseWildcardQuery(title, 100, 0, true, "t?e", "b*", "e*");
  assertEquals("phraseWildcard(title:\"t?e b* e*\")", withoutSlop.toString());

  final Query singleAndMultiTerms = phraseWildcardQuery(title, 100, 1, true, "t?e", "utopia", "e*");
  assertEquals("phraseWildcard(\"t?e utopia e*\"~1)", singleAndMultiTerms.toString(title));

  final Query repeatedMultiTerm = phraseWildcardQuery(title, 100, 1, true, "t?e", "b*", "b*");
  assertEquals("phraseWildcard(\"t?e b* b*\"~1)", repeatedMultiTerm.toString(title));
}
/**
 * Checks the explanation of the query, first through the searcher and then by
 * driving the {@link Weight} directly on each leaf.
 */
public void testExplain() throws IOException {
  final Query query = phraseWildcardQuery(field(0), 100, 0, true, "t?e", "b*", "b*");

  // Verify the standard way to get the query explanation.
  final ScoreDoc[] hits = searcher.search(query, MAX_DOCS).scoreDocs;
  for (ScoreDoc hit : hits) {
    final Explanation hitExplanation = searcher.explain(query, hit.doc);
    assertTrue(hitExplanation.getValue().doubleValue() > 0);
    assertEquals("weight(phraseWildcard(title:\"t?e b* b*\") in 1) [AssertingSimilarity], result of:", hitExplanation.getDescription());
  }

  // Verify that if we call PhraseWildcardQuery.PhraseWildcardWeight.scorer() twice,
  // the scoring is correct (even if it is not the standard path expected by the scorer() method).
  int matchCount = 0;
  final Weight weight = query.createWeight(searcher, ScoreMode.TOP_SCORES, 1);
  for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
    final Scorer scorer = weight.scorer(leaf);
    if (scorer == null) {
      continue;
    }
    for (DocIdSetIterator docs = scorer.iterator(); docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; ) {
      matchCount++;
    }
  }
  assertEquals(1, matchCount);

  // Explain doc 1 of every leaf; exactly one explanation is expected to carry a positive value.
  int positiveExplanationCount = 0;
  for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
    final Explanation leafExplanation = weight.explain(leaf, 1);
    if (leafExplanation.getValue().doubleValue() > 0) {
      positiveExplanationCount++;
    }
  }
  assertEquals(1, positiveExplanationCount);
}
/**
 * Runs the given code while flagging that {@link SpanNearQuery} scores may differ.
 * <p>
 * With two similar multi-terms whose expansions are subsets (e.g. "tim*" and "t*"),
 * we expect {@link PhraseWildcardQuery} and {@link MultiPhraseQuery} to
 * have the same scores, but {@link SpanNearQuery} scores are different.
 */
protected void expectDifferentScoreForSpanNearQueryWithMultiTermSubset(RunnableWithIOException runnable) throws IOException {
  differentScoreExpectedForSpanNearQuery = true;
  try {
    runnable.run();
  } finally {
    // Always restore the flag, including when the runnable throws.
    differentScoreExpectedForSpanNearQuery = false;
  }
}
/**
 * Compares {@link PhraseWildcardQuery} to both {@link MultiPhraseQuery}
 * and {@link SpanNearQuery}, for slop 0 then 1, each time without and then with
 * the segment optimization.
 */
protected void searchAndCheckResults(String field, int maxExpansions, String... terms) throws IOException {
  final boolean[] segmentOptimizationModes = {false, true};
  for (int slop = 0; slop <= 1; slop++) {
    for (boolean segmentOptimizationEnabled : segmentOptimizationModes) {
      searchAndCheckResults(field, maxExpansions, slop, segmentOptimizationEnabled, terms);
    }
  }
}
/**
 * Builds the three equivalent queries (phrase-wildcard, multi-phrase and span-near) for the
 * given terms and checks that they return the same results.
 */
protected void searchAndCheckResults(String field, int maxExpansions, int slop,
    boolean segmentOptimizationEnabled, String... terms) throws IOException {
  final Query testQuery = phraseWildcardQuery(field, maxExpansions, slop, segmentOptimizationEnabled, terms);
  final Query multiPhraseReference = multiPhraseQuery(field, maxExpansions, slop, terms);
  final Query spanNearReference = spanNearQuery(field, slop, terms);
  searchAndCheckSameResults(testQuery, multiPhraseReference, spanNearReference, segmentOptimizationEnabled);
}
/** Runs the multi-phrase comparison without and then with the segment optimization. */
protected void searchAndCheckResultsMultiplePhraseWildcards(String[] fields, int maxExpansions,
    int slop, String[][] multiPhraseTerms) throws IOException {
  for (boolean segmentOptimizationEnabled : new boolean[] {false, true}) {
    searchAndCheckResultsMultiplePhraseWildcards(fields, maxExpansions, slop, segmentOptimizationEnabled, multiPhraseTerms);
  }
}
/**
 * Builds three boolean queries (one per query implementation) with one clause per phrase.
 * Each clause is a disjunction-max over the given fields, and its occurrence (MUST or SHOULD)
 * is chosen randomly but shared by the three implementations. The three boolean queries
 * are then checked to return the same results.
 */
protected void searchAndCheckResultsMultiplePhraseWildcards(String[] fields, int maxExpansions, int slop,
    boolean segmentOptimizationEnabled, String[][] multiPhraseTerms) throws IOException {
  final BooleanQuery.Builder testBuilder = new BooleanQuery.Builder();
  final BooleanQuery.Builder multiPhraseBuilder = new BooleanQuery.Builder();
  final BooleanQuery.Builder spanNearBuilder = new BooleanQuery.Builder();

  for (String[] phraseTerms : multiPhraseTerms) {
    final BooleanClause.Occur occur;
    if (random().nextBoolean()) {
      occur = BooleanClause.Occur.MUST;
    } else {
      occur = BooleanClause.Occur.SHOULD;
    }

    final Query testClause = disMaxQuery(phraseWildcardQueries(fields, maxExpansions, slop, segmentOptimizationEnabled, phraseTerms));
    testBuilder.add(testClause, occur);

    final Query multiPhraseClause = disMaxQuery(multiPhraseQueries(fields, maxExpansions, slop, phraseTerms));
    multiPhraseBuilder.add(multiPhraseClause, occur);

    final Query spanNearClause = disMaxQuery(spanNearQueries(fields, slop, phraseTerms));
    spanNearBuilder.add(spanNearClause, occur);
  }

  searchAndCheckSameResults(
      testBuilder.build(),
      multiPhraseBuilder.build(),
      spanNearBuilder.build(),
      segmentOptimizationEnabled);
}
/** Wraps the given queries in a {@link DisjunctionMaxQuery} with a tie-breaker of 0.1. */
protected Query disMaxQuery(Query... disjuncts) {
  final float tieBreaker = 0.1f;
  return new DisjunctionMaxQuery(Arrays.asList(disjuncts), tieBreaker);
}
/** Creates one {@link PhraseWildcardQuery} per field, in field order, for the same terms. */
protected Query[] phraseWildcardQueries(String[] fields, int maxExpansions, int slop, boolean segmentOptimizationEnabled, String... terms) {
  return Arrays.stream(fields)
      .map(field -> phraseWildcardQuery(field, maxExpansions, slop, segmentOptimizationEnabled, terms))
      .toArray(Query[]::new);
}
/** Creates one {@link MultiPhraseQuery} per field, in field order, for the same terms. */
protected Query[] multiPhraseQueries(String[] fields, int maxExpansions, int slop, String... terms) throws IOException {
  final Query[] perFieldQueries = new Query[fields.length];
  int next = 0;
  for (String field : fields) {
    perFieldQueries[next++] = multiPhraseQuery(field, maxExpansions, slop, terms);
  }
  return perFieldQueries;
}
/** Creates one {@link SpanNearQuery} per field, in field order, for the same terms. */
protected Query[] spanNearQueries(String[] fields, int slop, String... terms) {
  return Arrays.stream(fields)
      .map(field -> spanNearQuery(field, slop, terms))
      .toArray(Query[]::new);
}
/**
 * Compares the results of the test query with those of the multi-phrase query, then with
 * those of the span-near query. The test counters are cleared between the two comparisons.
 */
protected void searchAndCheckSameResults(Query testQuery, Query multiPhraseQuery, Query spanNearQuery, boolean segmentOptimizationEnabled) throws IOException {
  // Search and compare results with MultiPhraseQuery.
  // The segment optimization changes the score (but not the result ranking),
  // so the scores are only compared when it is disabled.
  final boolean scoresComparable = !segmentOptimizationEnabled;
  searchAndCheckSameResults(testQuery, multiPhraseQuery, scoresComparable);

  // Clear the test stats to verify them only with the last test query execution.
  clearTestCounters();

  // Search and compare results with SpanNearQuery.
  final boolean sameScoreAsSpanNear = scoresComparable && !differentScoreExpectedForSpanNearQuery;
  searchAndCheckSameResults(testQuery, spanNearQuery, sameScoreAsSpanNear);
}
/** Clears the counters returned by {@code TestCounters.get()}. */
protected void clearTestCounters() {
  final TestCounters counters = TestCounters.get();
  counters.clear();
}
/**
 * Runs both queries and checks that they return the same number of results. Then, if
 * {@code compareScores} is true, checks that the results match rank by rank (same doc and
 * close-enough score); otherwise only checks that the sets of matching doc ids are equal.
 */
protected void searchAndCheckSameResults(Query testQuery, Query referenceQuery,
    boolean compareScores) throws IOException {
  final ScoreDoc[] testResults = searcher.search(testQuery, MAX_DOCS).scoreDocs;
  final ScoreDoc[] referenceResults = searcher.search(referenceQuery, MAX_DOCS).scoreDocs;
  final String referenceName = referenceQuery.getClass().getSimpleName();
  assertEquals("Number of results differ when comparing to " + referenceName,
      referenceResults.length, testResults.length);

  if (!compareScores) {
    // Ignore ranking and scores: only the matching doc ids have to be the same.
    final Set<Integer> testDocIds = new HashSet<>();
    for (ScoreDoc hit : testResults) {
      testDocIds.add(hit.doc);
    }
    final Set<Integer> referenceDocIds = new HashSet<>();
    for (ScoreDoc hit : referenceResults) {
      referenceDocIds.add(hit.doc);
    }
    assertEquals("Results differ when comparing to " + referenceName
            + " ignoring score\ntestResults=" + Arrays.toString(testResults) + "\nreferenceResults=" + Arrays.toString(referenceResults),
        referenceDocIds, testDocIds);
    return;
  }

  for (int rank = 0; rank < testResults.length; rank++) {
    assertTrue("Result " + rank + " differ when comparing to " + referenceName
            + "\ntestResults=" + Arrays.toString(testResults) + "\nreferenceResults=" + Arrays.toString(referenceResults),
        equals(testResults[rank], referenceResults[rank]));
  }
}
/**
 * Builds a {@link PhraseWildcardQuery} from the given terms: a term containing '*' or '?'
 * is added as a {@link WildcardQuery} multi-term, any other term is added as a plain term.
 */
protected PhraseWildcardQuery phraseWildcardQuery(String field, int maxExpansions,
    int slop, boolean segmentOptimizationEnabled, String... terms) {
  PhraseWildcardQuery.Builder builder = createPhraseWildcardQueryBuilder(field, maxExpansions, segmentOptimizationEnabled);
  builder = builder.setSlop(slop);
  for (String term : terms) {
    final boolean hasWildcard = term.indexOf('*') >= 0 || term.indexOf('?') >= 0;
    if (hasWildcard) {
      builder.addMultiTerm(new WildcardQuery(new Term(field, term)));
    } else {
      builder.addTerm(new BytesRef(term));
    }
  }
  return builder.build();
}
/** Creates the builder used by {@link #phraseWildcardQuery}; kept separate so it can be overridden. */
protected PhraseWildcardQuery.Builder createPhraseWildcardQueryBuilder(
    String field, int maxExpansions, boolean segmentOptimizationEnabled) {
  final PhraseWildcardQuery.Builder builder =
      new PhraseWildcardQuery.Builder(field, maxExpansions, segmentOptimizationEnabled);
  return builder;
}
/**
 * Builds an in-order {@link SpanNearQuery} from the given terms: a term containing '*' or '?'
 * becomes a wrapped {@link WildcardQuery}, any other term becomes a {@link SpanTermQuery}.
 */
protected SpanNearQuery spanNearQuery(String field, int slop, String... terms) {
  final SpanQuery[] clauses = new SpanQuery[terms.length];
  int next = 0;
  for (String term : terms) {
    final Term indexTerm = new Term(field, term);
    if (term.contains("*") || term.contains("?")) {
      clauses[next] = new SpanMultiTermQueryWrapper<>(new WildcardQuery(indexTerm));
    } else {
      clauses[next] = new SpanTermQuery(indexTerm);
    }
    next++;
  }
  return new SpanNearQuery(clauses, slop, true);
}
/**
 * Builds a {@link MultiPhraseQuery} from the given terms. A term containing '*' or '?' is
 * replaced by its expansions (see {@link #expandMultiTerm}); when there is no expansion,
 * a placeholder term is added instead. Any other term is added as is.
 */
protected MultiPhraseQuery multiPhraseQuery(String field, int maxExpansions, int slop, String... terms) throws IOException {
  MultiPhraseQuery.Builder builder = new MultiPhraseQuery.Builder();
  builder = builder.setSlop(slop);
  for (String term : terms) {
    final boolean hasWildcard = term.contains("*") || term.contains("?");
    if (!hasWildcard) {
      builder.add(new Term(field, term));
      continue;
    }
    final Term[] expansions = expandMultiTerm(field, term, maxExpansions);
    if (expansions.length == 0) {
      // Keep a position in the phrase for the multi-term without expansion.
      builder.add(new Term(field, "non-matching-term"));
    } else {
      builder.add(expansions);
    }
  }
  return builder.build();
}
/**
 * Collects the distinct terms of the given field that match the wildcard pattern,
 * visiting the leaves of the reader in order and stopping as soon as
 * {@code maxExpansions} distinct terms have been collected.
 */
protected Term[] expandMultiTerm(String field, String term, int maxExpansions) throws IOException {
  if (maxExpansions == 0) {
    return new Term[0];
  }
  final Set<Term> collected = new HashSet<>();
  final WildcardQuery wildcard = new WildcardQuery(new Term(field, term));
  for (final LeafReaderContext leaf : reader.leaves()) {
    final Terms fieldTerms = leaf.reader().terms(field);
    if (fieldTerms == null) {
      // The field is absent from this leaf.
      continue;
    }
    final TermsEnum matches = wildcard.getTermsEnum(fieldTerms);
    while (matches.next() != null) {
      collected.add(new Term(field, matches.term()));
      if (collected.size() >= maxExpansions) {
        return collected.toArray(new Term[0]);
      }
    }
  }
  return collected.toArray(new Term[0]);
}
/** Returns whether both results refer to the same doc with scores less than 1E-1 apart. */
protected static boolean equals(ScoreDoc result1, ScoreDoc result2) {
  if (result1.doc != result2.doc) {
    return false;
  }
  // Due to randomness, the value of the score comparison epsilon varies a lot.
  // We take a 1E-1 epsilon to ensure the test does not flap.
  return Math.abs(result1.score - result2.score) < 1E-1;
}
/**
 * Indexes the test documents in two batches of three documents, committing after each batch.
 */
protected void addSegments(RandomIndexWriter iw) throws IOException {
  final String title = field(0);
  final String author = field(1);
  final String category = field(2);
  final String other = field(3);

  // First segment.
  addDocs(iw,
      doc(
          field(title, "time conversion"),
          field(author, "eric hawk"),
          field(category, "time travel")),
      doc(
          field(title, "the blinking books"),
          field(author, "donald ever"),
          field(category, "time travel")),
      doc(
          field(title, "the utopia experiment"),
          field(author, "dylan brief"),
          field(category, "utopia"),
          field(other, "travelling to utopiapolis")));
  iw.commit();

  // Second segment.
  // No field(2).
  addDocs(iw,
      doc(
          field(title, "serene evasion"),
          field(author, "eric brown")),
      doc(
          field(title, "my blind experiment"),
          field(author, "eric bright")),
      doc(
          field(other, "two times travel")));
  iw.commit();
}
/** Returns the name of the field at the given index of {@link #FIELDS}. */
protected String field(int index) {
  final String fieldName = FIELDS[index];
  return fieldName;
}
/** Adds the given documents to the writer with a single {@code addDocuments} call. */
protected static void addDocs(RandomIndexWriter iw, Document... docs) throws IOException {
  final Iterable<Document> batch = Arrays.asList(docs);
  iw.addDocuments(batch);
}
/** Creates a document holding the given fields, added in the given order. */
protected static Document doc(Field... fields) {
  final Document document = new Document();
  for (int i = 0; i < fields.length; i++) {
    document.add(fields[i]);
  }
  return document;
}
/** Creates a non-stored text field with the given name and value. */
protected static Field field(String field, String fieldValue) {
  final Field textField = newTextField(field, fieldValue, Field.Store.NO);
  return textField;
}
/** A {@link Runnable}-like task that is allowed to throw {@link IOException}. */
@FunctionalInterface
private interface RunnableWithIOException {
  /** Runs the task. */
  void run() throws IOException;
}
}